/**
 * Resolves and validates the configured site URIs once the bean has been constructed.
 *
 * <p>It is assumed that if anything is wrong with {@code MainUrl} at application start-up, the
 * container will not start: every validation failure is reported as a {@link RuntimeException}.
 *
 * <p>{@code SecureUrl} is optional. When it is not configured it is derived from {@code MainUrl}
 * by switching a leading {@code http:} scheme to {@code https:}.
 *
 * @throws RuntimeException if {@code MainUrl} cannot be parsed, is not an absolute URI or has no
 *     host, or if the secure URL cannot be parsed
 */
@PostConstruct
public void init() {
    try {
        mainURI = new URI(properties.getProperty("MainUrl"), true, "UTF-8");
    } catch (Exception e) {
        // Keep the original exception as the cause so the stack trace is not lost.
        throw new RuntimeException(ERR_MSG + e.getMessage(), e);
    }

    if (!mainURI.isAbsoluteURI()) {
        throw new RuntimeException(ERR_MSG + "URI not absolute path");
    }

    try {
        if (mainURI.getHost() == null) {
            throw new RuntimeException(ERR_MSG + "bad URI host");
        }
    } catch (URIException e) {
        throw new RuntimeException(ERR_MSG + e.getMessage(), e);
    }

    // Anchor the replacement on the scheme: an unanchored "http" -> "https" rewrite would turn
    // an already secure "https://..." MainUrl into "httpss://...", and could also hit an "http"
    // substring that appears later in the URI.
    String defaultSecureUrl = mainURI.toString().replaceFirst("^http:", "https:");
    try {
        secureURI = new URI(properties.getProperty("SecureUrl", defaultSecureUrl), true, "UTF-8");
    } catch (Exception e) {
        throw new RuntimeException(ERR_MSG + e.getMessage(), e);
    }
}
private boolean isValidMessageToScan(HttpMessage msg) { if (getScannerOptions().isScanHeadersAllRequests()) { return true; } // First we check if it's a dynamic or static page // I'd to do this because scanning starts to be veeeeery slow // -- // this is a trivial implementation, should be good to have // a page dynamic check at the parent plugin level which should // use or not Variants according to the behavior of the request // (e.g. different content or status error/redirect) String query = null; try { query = msg.getRequestHeader().getURI().getQuery(); } catch (URIException e) { log.error(e.getMessage(), e); } // If there's almost one GET parameter go ahead if (query == null || query.isEmpty()) { // If also the Request body is null maybe it's a static page oer a null parameter page if (msg.getRequestBody().length() == 0) { return false; } } return true; }
/** * 获取网络图片 * * @param url * @return */ public static Bitmap getBitmapByNet(String url) throws AppException { // System.out.println("image_url==> "+url); URI uri = null; try { uri = new URI(url, false, "UTF-8"); } catch (URIException e) { e.printStackTrace(); } if (uri != null) url = uri.toString(); HttpClient httpClient = null; GetMethod httpGet = null; Bitmap bitmap = null; int time = 0; do { try { httpClient = HttpHelper.getHttpClient(); httpGet = HttpHelper.getHttpGet(url, HttpHelper.getUserAgent()); int statusCode = httpClient.executeMethod(httpGet); if (statusCode != HttpStatus.SC_OK) { throw AppException.http(statusCode); } InputStream inStream = httpGet.getResponseBodyAsStream(); bitmap = BitmapFactory.decodeStream(inStream); inStream.close(); break; } catch (HttpException e) { time++; if (time < RETRY_TIME) { try { Thread.sleep(1000); } catch (InterruptedException e1) { } continue; } // 发生致命的异常,可能是协议不对或者返回的内容有问题 e.printStackTrace(); throw AppException.http(e); } catch (IOException e) { time++; if (time < RETRY_TIME) { try { Thread.sleep(1000); } catch (InterruptedException e1) { } continue; } // 发生网络异常 e.printStackTrace(); throw AppException.network(e); } finally { // 释放连接 httpGet.releaseConnection(); } } while (time < RETRY_TIME); return bitmap; }
/**
 * Builds a multi-line debug dump of a request: method type, URI, query string and, for POST
 * requests, the body parameters and the string body.
 *
 * @param method the HTTP method to describe
 * @return the dump text; if the URI cannot be read, the dump ends with the URIException message
 */
private String getHttpInfoDumy(HttpMethod method) {
    NameValuePair[] params = null;
    String methodType = "GET";
    String reqBody = null;
    if (method instanceof PostMethod) {
        PostMethod post = (PostMethod) method;
        params = post.getParameters();
        methodType = "POST";
        // The entity is not guaranteed to be a StringRequestEntity and may be absent; an
        // unconditional cast followed by getContent() would throw ClassCastException or
        // NullPointerException from what is only a diagnostic helper.
        Object entity = post.getRequestEntity();
        if (entity instanceof StringRequestEntity) {
            reqBody = ((StringRequestEntity) entity).getContent();
        }
    }

    StringBuilder sb = new StringBuilder();
    try {
        sb.append("#### getHttpInfoDumy ####");
        sb.append("\n## " + methodType + " [" + method.getURI() + "], hscd[" + this.hashCode() + "]");
    } catch (URIException e) {
        sb.append("\n## getParamsQueryStr- URIException " + e.getMessage() + "]");
        return sb.toString();
    }

    String queryString = method.getQueryString();
    if (queryString != null && queryString.length() > 0) {
        sb.append("\n" + "## queryString[" + queryString + "]");
    }

    if (params != null) {
        for (int i = 0; i < params.length; i++) {
            NameValuePair param = params[i];
            sb.append(
                "\n"
                    + "## POST body param["
                    + i
                    + "], name["
                    + param.getName()
                    + "], value["
                    + param.getValue()
                    + "]");
        }
    }

    if (reqBody != null) {
        sb.append("\n" + "## POST body String [" + reqBody + "]");
    }
    sb.append("\n##########");
    return sb.toString();
}
public boolean populate(CrawlURI curi, HttpClient http, HttpMethod method, String payload) { // http is not used. // payload is not used. boolean result = false; Map formItems = null; try { formItems = getFormItems(curi); } catch (AttributeNotFoundException e1) { logger.severe("Failed get of form items for " + curi); } if (formItems == null || formItems.size() <= 0) { try { logger.severe("No form items for " + method.getURI()); } catch (URIException e) { logger.severe("No form items and exception getting uri: " + e.getMessage()); } return result; } NameValuePair[] data = new NameValuePair[formItems.size()]; int index = 0; String key = null; for (Iterator i = formItems.keySet().iterator(); i.hasNext(); ) { key = (String) i.next(); data[index++] = new NameValuePair(key, (String) formItems.get(key)); } if (method instanceof PostMethod) { ((PostMethod) method).setRequestBody(data); result = true; } else if (method instanceof GetMethod) { // Append these values to the query string. // Get current query string, then add data, then get it again // only this time its our data only... then append. HttpMethodBase hmb = (HttpMethodBase) method; String currentQuery = hmb.getQueryString(); hmb.setQueryString(data); String newQuery = hmb.getQueryString(); hmb.setQueryString(((currentQuery != null) ? currentQuery : "") + "&" + newQuery); result = true; } else { logger.severe("Unknown method type: " + method); } return result; }
/**
 * Tests whether {@code curi} is itself the prerequisite (login) URI configured for it.
 *
 * <p>The configured prerequisite is resolved against the URI of {@code curi}. When the result
 * equals that URI, {@code curi} is flagged as a prerequisite if it is not flagged already.
 *
 * @param curi the URI to test
 * @return {@code true} if {@code curi} is the prerequisite; {@code false} if none is
 *     configured, if it does not match, or if the prerequisite cannot be resolved
 */
public boolean isPrerequisite(final CrawlURI curi) {
    final String curiStr = curi.getUURI().toString();
    final String loginUri = getPrerequisite(curi);
    if (loginUri == null) {
        return false;
    }

    final UURI resolved;
    try {
        resolved = UURIFactory.getInstance(curi.getUURI(), loginUri);
    } catch (URIException e) {
        logger.severe("Failed to uuri: " + curi + ", " + e.getMessage());
        return false;
    }

    if (resolved == null || curiStr == null || !resolved.toString().equals(curiStr)) {
        return false;
    }

    if (!curi.isPrerequisite()) {
        curi.setPrerequisite(true);
        logger.fine(curi + " is prereq.");
    }
    return true;
}
protected void addHeaderLink(CrawlURI curi, Header loc) { if (loc == null) { // If null, return without adding anything. return; } // TODO: consider possibility of multiple headers try { /** * 302重定向使用自定义的方法存储link * * @modify: wuliufu * @since : 2012-05-11 */ curi.createAndAddLocationLink( curi.getVia(), loc.getValue(), loc.getName() + ":", Link.REFER_HOP); if (curi.getObject(URLInfo.ATTACH) != null) { UURI outUURI = UURIFactory.getInstance(curi.getUURI(), loc.getValue()); logger.debug( "ParseHTTP: curi = " + curi.getUURI().toString() + "&& " + loc.getName() + "=" + outUURI.toString()); curi.putObject(outUURI.toString(), curi.getObject(URLInfo.ATTACH)); } numberOfLinksExtracted++; } catch (URIException e) { // There may not be a controller (e.g. If we're being run // by the extractor tool). if (getController() != null) { getController().logUriError(e, curi.getUURI(), loc.getValue()); } else { logger.info(curi + ", " + loc.getValue() + ": " + e.getMessage()); } } }
/**
 * Run method of the worker thread.
 *
 * <p>Loops while {@code manager.hasWorkLeft()}: takes a {@code WorkUnit} from
 * {@code manager.workQueue}, issues a HEAD or GET request for its URL and reports the outcome to
 * the manager. One of three checks decides what counts as a find:
 *
 * <ul>
 *   <li>content analysis (GET only): the cleaned response is compared with the base case;
 *   <li>custom regex: the raw response is matched against the base case's regex, and a match
 *       means "not found";
 *   <li>response code only: any code other than the base case fail code, 404, 400 or 0 is a
 *       find; for HEAD work the URL is then requested again with GET to capture the response.
 * </ul>
 *
 * <p>The loop returns when {@code stop} is set or the thread is interrupted, and blocks in
 * {@code wait()} while {@code pleaseWait} is set. Request failures are reported through
 * {@code manager.foundError}. Every handler shown calls {@code manager.workDone()}, and both
 * HTTP methods are released in the {@code finally} block.
 */
public void run() {
    queue = manager.workQueue;
    while (manager.hasWorkLeft()) {
        working = false;
        // code to make the worker pause, if the pause button has been pressed

        // if the stop signal has been given stop the thread
        if (stop) {
            return;
        }

        // this pauses the thread: block on this object's monitor until pleaseWait is cleared
        synchronized (this) {
            while (pleaseWait) {
                try {
                    wait();
                } catch (InterruptedException e) {
                    return;
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }

        // Declared outside the try so the finally block can release whichever one was created.
        GetMethod httpget = null;
        HeadMethod httphead = null;
        try {
            work = (WorkUnit) queue.take();
            working = true;
            url = work.getWork();
            int code = 0;
            String responce = "";
            String rawResponce = "";

            // if the work is a head request
            if (work.getMethod().equalsIgnoreCase("HEAD")) {
                if (Config.debug) {
                    System.out.println("DEBUG Worker[" + threadId + "]: HEAD " + url.toString());
                }
                httphead = new HeadMethod(url.toString());

                // set the custom HTTP headers
                Vector HTTPheaders = manager.getHTTPHeaders();
                for (int a = 0; a < HTTPheaders.size(); a++) {
                    HTTPHeader httpHeader = (HTTPHeader) HTTPheaders.elementAt(a);
                    /*
                     * Host header has to be set in a different way!
                     */
                    if (httpHeader.getHeader().startsWith("Host")) {
                        httphead.getParams().setVirtualHost(httpHeader.getValue());
                    } else {
                        httphead.setRequestHeader(httpHeader.getHeader(), httpHeader.getValue());
                    }
                }
                httphead.setFollowRedirects(Config.followRedirects);

                /*
                 * this code is used to limit the number of request/sec:
                 * sleep in 100 ms steps while (total done / elapsed seconds) exceeds the limit
                 */
                if (manager.isLimitRequests()) {
                    while (manager.getTotalDone() / ((System.currentTimeMillis() - manager.getTimestarted()) / 1000.0) > manager.getLimitRequestsTo()) {
                        Thread.sleep(100);
                    }
                }

                /*
                 * Send the head request
                 */
                code = httpclient.executeMethod(httphead);
                if (Config.debug) {
                    System.out.println("DEBUG Worker[" + threadId + "]: " + code + " " + url.toString());
                }
                httphead.releaseConnection();
            }
            // if we are doing a get request
            else if (work.getMethod().equalsIgnoreCase("GET")) {
                // make the request;
                if (Config.debug) {
                    System.out.println("DEBUG Worker[" + threadId + "]: GET " + url.toString());
                }
                httpget = new GetMethod(url.toString());

                // set the custom HTTP headers
                Vector HTTPheaders = manager.getHTTPHeaders();
                for (int a = 0; a < HTTPheaders.size(); a++) {
                    HTTPHeader httpHeader = (HTTPHeader) HTTPheaders.elementAt(a);
                    /*
                     * Host header has to be set in a different way!
                     */
                    if (httpHeader.getHeader().startsWith("Host")) {
                        httpget.getParams().setVirtualHost(httpHeader.getValue());
                    } else {
                        httpget.setRequestHeader(httpHeader.getHeader(), httpHeader.getValue());
                    }
                }
                httpget.setFollowRedirects(Config.followRedirects);

                /*
                 * this code is used to limit the number of request/sec
                 */
                if (manager.isLimitRequests()) {
                    while (manager.getTotalDone() / ((System.currentTimeMillis() - manager.getTimestarted()) / 1000.0) > manager.getLimitRequestsTo()) {
                        Thread.sleep(100);
                    }
                }

                code = httpclient.executeMethod(httpget);
                if (Config.debug) {
                    System.out.println("DEBUG Worker[" + threadId + "]: " + code + " " + url.toString());
                }

                // set up the input stream
                // NOTE(review): no charset is passed to InputStreamReader, so the platform
                // default is used - confirm this is intended.
                BufferedReader input = new BufferedReader(new InputStreamReader(httpget.getResponseBodyAsStream()));

                // save the headers into a string, used in viewing the raw response
                String rawHeader;
                rawHeader = httpget.getStatusLine() + "\r\n";
                Header headers[] = httpget.getResponseHeaders();
                StringBuffer buf = new StringBuffer();
                for (int a = 0; a < headers.length; a++) {
                    buf.append(headers[a].getName() + ": " + headers[a].getValue() + "\r\n");
                }
                rawHeader = rawHeader + buf.toString();
                buf = new StringBuffer();

                // read in the response body; every line is prefixed with CRLF
                String line;
                while ((line = input.readLine()) != null) {
                    buf.append("\r\n" + line);
                }
                responce = buf.toString();
                input.close();
                rawResponce = rawHeader + responce;

                // clean the response

                // parse the html of what we have found (the regex mode queues it later, and
                // only when the regex did not match)
                if (Config.parseHTML && !work.getBaseCaseObj().isUseRegexInstead()) {
                    Header contentType = httpget.getResponseHeader("Content-Type");
                    if (contentType != null) {
                        if (contentType.getValue().startsWith("text")) {
                            manager.addHTMLToParseQueue(new HTMLparseWorkUnit(responce, work));
                        }
                    }
                }
                responce = FilterResponce.CleanResponce(responce, work);
                Thread.sleep(10);
                httpget.releaseConnection();
            } else {
                // There is no need to deal with requests other than HEAD or GET
            }

            // if we need to check against the base case
            if (work.getMethod().equalsIgnoreCase("GET") && work.getBaseCaseObj().useContentAnalysisMode()) {
                if (code == 200) {
                    if (Config.debug) {
                        System.out.println(
                            "DEBUG Worker[" + threadId + "]: Base Case Check " + url.toString());
                    }

                    // TODO move this option to the Adv options
                    // if the response does not match the base case
                    Pattern regexFindFile = Pattern.compile(".*file not found.*", Pattern.CASE_INSENSITIVE);
                    Matcher m = regexFindFile.matcher(responce);

                    // need to clean the base case of the item we are looking for
                    String basecase =
                        FilterResponce.removeItemCheckedFor(
                            work.getBaseCaseObj().getBaseCase(), work.getItemToCheck());

                    if (m.find()) {
                        // do nothing as we have a 404
                    } else if (!responce.equalsIgnoreCase(basecase)) {
                        if (work.isDir()) {
                            if (Config.debug) {
                                System.out.println(
                                    "DEBUG Worker[" + threadId + "]: Found Dir (base case)" + url.toString());
                            }
                            // we found a dir
                            manager.foundDir(url, code, responce, basecase, rawResponce, work.getBaseCaseObj());
                        } else {
                            // found a file
                            if (Config.debug) {
                                System.out.println(
                                    "DEBUG Worker[" + threadId + "]: Found File (base case)" + url.toString());
                            }
                            manager.foundFile(
                                url,
                                code,
                                responce,
                                work.getBaseCaseObj().getBaseCase(),
                                rawResponce,
                                work.getBaseCaseObj());
                        }
                    }
                } else if (code == 404 || code == 400) {
                    // again do nothing as it is not there
                } else {
                    // any other status code is reported as a find
                    if (work.isDir()) {
                        if (Config.debug) {
                            System.out.println(
                                "DEBUG Worker[" + threadId + "]: Found Dir (base case)" + url.toString());
                        }
                        // we found a dir
                        manager.foundDir(
                            url,
                            code,
                            responce,
                            work.getBaseCaseObj().getBaseCase(),
                            rawResponce,
                            work.getBaseCaseObj());
                    } else {
                        // found a file
                        if (Config.debug) {
                            System.out.println(
                                "DEBUG Worker[" + threadId + "]: Found File (base case)" + url.toString());
                        }
                        manager.foundFile(
                            url,
                            code,
                            responce,
                            work.getBaseCaseObj().getBaseCase(),
                            rawResponce,
                            work.getBaseCaseObj());
                    }
                    // manager.foundError(url, "Base Case Mode Error - Responce code came back as " + code +
                    // " it should have been 200");
                    // manager.workDone();
                }
            }
            /*
             * use the custom regex check instead
             */
            else if (work.getBaseCaseObj().isUseRegexInstead()) {
                Pattern regexFindFile = Pattern.compile(work.getBaseCaseObj().getRegex());
                Matcher m = regexFindFile.matcher(rawResponce);
                /*
                System.out.println("======Trying to find======");
                System.out.println(work.getBaseCaseObj().getRegex());
                System.out.println("======In======");
                System.out.println(responce);
                System.out.println("======/In======");
                */
                if (m.find()) {
                    // do nothing as we have a 404
                    if (Config.debug) {
                        System.out.println(
                            "DEBUG Worker[" + threadId + "]: Regex matched so it's a 404, " + url.toString());
                    }
                } else {
                    if (Config.parseHTML) {
                        // NOTE(review): httpget is only assigned in the GET branch above. If a
                        // HEAD work unit can reach this branch, httpget is null here and this
                        // call throws a NullPointerException that no catch below handles -
                        // confirm that regex mode always uses GET.
                        Header contentType = httpget.getResponseHeader("Content-Type");
                        if (contentType != null) {
                            if (contentType.getValue().startsWith("text")) {
                                manager.addHTMLToParseQueue(new HTMLparseWorkUnit(rawResponce, work));
                            }
                        }
                    }
                    if (work.isDir()) {
                        if (Config.debug) {
                            System.out.println(
                                "DEBUG Worker[" + threadId + "]: Found Dir (regex) " + url.toString());
                        }
                        // we found a dir
                        manager.foundDir(
                            url,
                            code,
                            responce,
                            work.getBaseCaseObj().getBaseCase(),
                            rawResponce,
                            work.getBaseCaseObj());
                    } else {
                        // found a file
                        if (Config.debug) {
                            System.out.println(
                                "DEBUG Worker[" + threadId + "]: Found File (regex) " + url.toString());
                        }
                        manager.foundFile(
                            url,
                            code,
                            responce,
                            work.getBaseCaseObj().getBaseCase(),
                            rawResponce,
                            work.getBaseCaseObj());
                    }
                    // manager.foundError(url, "Base Case Mode Error - Responce code came back as " + code +
                    // " it should have been 200");
                    // manager.workDone();
                }
            }
            // just check the response code
            else {
                // if it is not the fail code, a 404 or a 400 then we have a possible find
                // (code 0 means no request was made, i.e. the method was neither HEAD nor GET)
                if (code != work.getBaseCaseObj().getFailCode() && code != 404 && code != 0 && code != 400) {
                    if (work.getMethod().equalsIgnoreCase("HEAD")) {
                        // A HEAD request has no body, so fetch the same URL again with GET.
                        if (Config.debug) {
                            System.out.println(
                                "DEBUG Worker[" + threadId + "]: Getting responce via GET " + url.toString());
                        }
                        rawResponce = "";
                        httpget = new GetMethod(url.toString());
                        // NOTE(review): unlike the two header loops above, a "Host" header is
                        // not routed through setVirtualHost here - confirm whether that is
                        // intentional.
                        Vector HTTPheaders = manager.getHTTPHeaders();
                        for (int a = 0; a < HTTPheaders.size(); a++) {
                            HTTPHeader httpHeader = (HTTPHeader) HTTPheaders.elementAt(a);
                            httpget.setRequestHeader(httpHeader.getHeader(), httpHeader.getValue());
                        }
                        httpget.setFollowRedirects(Config.followRedirects);

                        /*
                         * this code is used to limit the number of request/sec
                         */
                        if (manager.isLimitRequests()) {
                            while (manager.getTotalDone() / ((System.currentTimeMillis() - manager.getTimestarted()) / 1000.0) > manager.getLimitRequestsTo()) {
                                Thread.sleep(100);
                            }
                        }

                        int newCode = httpclient.executeMethod(httpget);

                        // in some cases the second get can return a different result than the
                        // first head request!
                        if (newCode != code) {
                            manager.foundError(
                                url,
                                "Return code for first HEAD, is different to the second GET: "
                                    + code
                                    + " - "
                                    + newCode);
                        }
                        rawResponce = "";

                        // build a string version of the headers
                        rawResponce = httpget.getStatusLine() + "\r\n";
                        Header headers[] = httpget.getResponseHeaders();
                        StringBuffer buf = new StringBuffer();
                        for (int a = 0; a < headers.length; a++) {
                            buf.append(headers[a].getName() + ": " + headers[a].getValue() + "\r\n");
                        }
                        buf.append("\r\n");
                        rawResponce = rawResponce + buf.toString();

                        // the body is only read when the reported content length is positive
                        if (httpget.getResponseContentLength() > 0) {
                            // get the http body
                            BufferedReader input = new BufferedReader(new InputStreamReader(httpget.getResponseBodyAsStream()));
                            String line;
                            String tempResponce = "";
                            buf = new StringBuffer();
                            while ((line = input.readLine()) != null) {
                                buf.append("\r\n" + line);
                            }
                            tempResponce = buf.toString();
                            input.close();
                            rawResponce = rawResponce + tempResponce;

                            Header contentType = httpget.getResponseHeader("Content-Type");
                            if (Config.parseHTML) {
                                contentType = httpget.getResponseHeader("Content-Type");
                                if (contentType != null) {
                                    if (contentType.getValue().startsWith("text")) {
                                        manager.addHTMLToParseQueue(new HTMLparseWorkUnit(tempResponce, work));
                                    }
                                }
                            }
                        }
                        httpget.releaseConnection();
                    }

                    // report with the code of the first request (not newCode)
                    if (work.isDir()) {
                        manager.foundDir(url, code, rawResponce, work.getBaseCaseObj());
                    } else {
                        manager.foundFile(url, code, rawResponce, work.getBaseCaseObj());
                    }
                }
            }
            manager.workDone();
            Thread.sleep(20);
        } catch (NoHttpResponseException e) {
            manager.foundError(url, "NoHttpResponseException " + e.getMessage());
            manager.workDone();
        } catch (ConnectTimeoutException e) {
            manager.foundError(url, "ConnectTimeoutException " + e.getMessage());
            manager.workDone();
        } catch (URIException e) {
            manager.foundError(url, "URIException " + e.getMessage());
            manager.workDone();
        } catch (IOException e) {
            manager.foundError(url, "IOException " + e.getMessage());
            manager.workDone();
        } catch (InterruptedException e) {
            // interruption ends the worker: count the unit as done and leave the loop
            // manager.foundError(url, "InterruptedException " + e.getMessage());
            manager.workDone();
            return;
        } catch (IllegalArgumentException e) {
            e.printStackTrace();
            manager.foundError(url, "IllegalArgumentException " + e.getMessage());
            manager.workDone();
        } finally {
            // release both methods on every path, including those where an exception skipped
            // the explicit releaseConnection() calls above
            if (httpget != null) {
                httpget.releaseConnection();
            }
            if (httphead != null) {
                httphead.releaseConnection();
            }
        }
    }
}
/** @param args program arguments */ public static void main(String[] args) { AggressiveUrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer(); int n = 0; int i = 0; ArrayList<Integer> columns = new ArrayList<Integer>(); long lineNumber = 0; boolean cdxPassThru = false; String delimiter = " "; while (n < args.length) { String arg = args[n]; if (arg.compareTo("-cdx") == 0) { cdxPassThru = true; n++; continue; } if (n == (args.length - 1)) { USAGE(); } String val = args[n + 1]; if (arg.compareTo("-f") == 0) { columns.add(new Integer(val)); } else if (arg.compareTo("-d") == 0) { delimiter = val; } else { USAGE(); } n += 2; } // place default '0' in case none specified: if (columns.size() == 0) { columns.add(new Integer(1)); } // convert to int[]: int[] cols = new int[columns.size()]; for (int idx = 0; idx < columns.size(); idx++) { cols[idx] = columns.get(idx).intValue() - 1; } BufferedReader r = new BufferedReader(new InputStreamReader(System.in, ByteOp.UTF8)); StringBuilder sb = new StringBuilder(); String line = null; while (true) { try { line = r.readLine(); } catch (IOException e) { e.printStackTrace(); System.exit(1); } if (line == null) { break; } lineNumber++; if (cdxPassThru && line.startsWith(CDX_PREFIX)) { System.out.println(line); continue; } String parts[] = line.split(delimiter); for (int column : cols) { if (column >= parts.length) { System.err.println("Invalid line " + lineNumber + " (" + line + ") skipped"); } else { try { parts[column] = canonicalizer.urlStringToKey(parts[column]); } catch (URIException e) { System.err.println( "Invalid URL in line " + lineNumber + " (" + line + ") skipped (" + parts[column] + ")"); e.printStackTrace(); continue; } catch (StringIndexOutOfBoundsException e) { System.err.println( "Invalid URL in line " + lineNumber + " (" + line + ") skipped (" + parts[column] + ")"); e.printStackTrace(); continue; } } } sb.setLength(0); for (i = 0; i < parts.length; i++) { sb.append(parts[i]); if (i < (parts.length - 1)) { 
sb.append(delimiter); } } System.out.println(sb.toString()); } }
public void getCdx(CDXQuery query, AuthToken authToken, CDXWriter responseWriter) throws IOException { CloseableIterator<String> iter = null; try { // Check for wildcards as shortcuts for matchType if (query.matchType == null) { if (query.url.startsWith("*.")) { query.matchType = MatchType.domain; query.url = query.url.substring(2); } else if (query.url.endsWith("*")) { query.matchType = MatchType.prefix; query.url = query.url.substring(0, query.url.length() - 1); } else { query.matchType = MatchType.exact; } } CDXAccessFilter accessChecker = null; if (!authChecker.isAllUrlAccessAllowed(authToken)) { accessChecker = authChecker.createAccessFilter(authToken); } // // For now, don't support domain or host output w/o key as access check is too slow // if (query.matchType == MatchType.domain || query.matchType == MatchType.host) { // if (!authChecker.isAllUrlAccessAllowed(authToken)) { // return; // } // } String startEndUrl[] = urlSurtRangeComputer.determineRange(query.url, query.matchType, "", ""); if (startEndUrl == null) { responseWriter.printError( "Sorry, matchType=" + query.matchType.name() + " is not supported by this server"); return; } if ((accessChecker != null) && !accessChecker.includeUrl(startEndUrl[0], query.url)) { if (query.showNumPages) { // Default to 1 page even if no results responseWriter.printNumPages(1, false); } return; } if (query.last || query.limit == -1) { query.limit = 1; query.setSort(SortType.reverse); } int maxLimit; if (query.fastLatest == null) { // Optimize: default fastLatest to true for last line or closest // sorted results if ((query.limit == -1) || (!query.closest.isEmpty() && (query.limit > 0))) { query.fastLatest = true; } else { query.fastLatest = false; } } // Paged query if (query.page >= 0 || query.showNumPages) { iter = createPagedCdxIterator(startEndUrl, query, authToken, responseWriter); if (iter == null) { return; } // Page size determines the max limit here maxLimit = Integer.MAX_VALUE; } else { // Non-Paged Merged 
query iter = createBoundedCdxIterator(startEndUrl, query, null, null); // TODO: apply collection-view filtering here. It should happen separately // from exclusion check. We'd need to parse CDX lines into CDXLine object // before passing it to writeCdxResponse(). Pass CDXFilter to getCdx()? // Pass CDX source object that escapsulates collection-view filtering? maxLimit = this.queryMaxLimit; } writeCdxResponse(responseWriter, iter, maxLimit, query, authToken, accessChecker); } catch (URIException e) { responseWriter.printError(e.toString()); } catch (URISyntaxException e) { responseWriter.printError(e.toString()); } finally { if (iter != null) { iter.close(); } } }