/** * 将Entity的内容载入Page对象 * * @date 2013-1-7 上午11:22:06 * @param entity * @return * @throws Exception */ private Page load(HttpEntity entity) throws Exception { Page page = new Page(); // 设置返回内容的ContentType String contentType = null; Header type = entity.getContentType(); if (type != null) contentType = type.getValue(); page.setContentType(contentType); // 设置返回内容的字符编码 String contentEncoding = null; Header encoding = entity.getContentEncoding(); if (encoding != null) contentEncoding = encoding.getValue(); page.setEncoding(contentEncoding); // 设置返回内容的字符集 String contentCharset = EntityUtils.getContentCharSet(entity); page.setCharset(contentCharset); // 根据配置文件设置的字符集参数进行内容二进制话 String charset = config.getCharset(); String content = this.read(entity.getContent(), charset); page.setContent(content); // if (charset == null || charset.trim().length() == 0) // page.setContentData(content.getBytes()); // else // page.setContentData(content.getBytes(charset)); return page; }
/** * 抓取目标url的内容 * * @date 2013-1-7 上午11:08:54 * @param toFetchURL * @return */ public FetchResult fetch(FetchRequest req) throws Exception { if (req.getHttpMethod() != null && !Http.Method.GET.equals(req.getHttpMethod())) { // 获取到URL后面的QueryParam String query = new URL(req.getUrl()).getQuery(); for (String q : query.split("\\&")) { String[] qv = q.split("="); String name = qv[0]; String val = qv[1]; List<Object> vals = req.getParams().get(name); if (vals == null) { vals = new ArrayList<Object>(); req.getParams().put(name, vals); } vals.add(val); } return request(req); } FetchResult fetchResult = new FetchResult(); HttpGet get = null; HttpEntity entity = null; String toFetchURL = req.getUrl(); try { get = new HttpGet(toFetchURL); // 设置请求GZIP压缩,注意,前面必须设置GZIP解压缩处理 get.addHeader("Accept-Encoding", "gzip"); for (Iterator<Entry<String, String>> it = headers.entrySet().iterator(); it.hasNext(); ) { Entry<String, String> entry = it.next(); get.addHeader(entry.getKey(), entry.getValue()); } // 同步信号量,在真正对服务端进行访问之前进行访问间隔的控制 // TODO 针对每个请求有一个delay的参数设置 synchronized (mutex) { // 获取当前时间 long now = (new Date()).getTime(); // 对同一个Host抓取时间间隔进行控制,若在设置的时限内则进行休眠 if (now - lastFetchTime < config.getPolitenessDelay()) Thread.sleep(config.getPolitenessDelay() - (now - lastFetchTime)); // 不断更新最后的抓取时间,注意,是针对HOST的,不是针对某个URL的 lastFetchTime = (new Date()).getTime(); } // 记录get请求信息 Header[] headers = get.getAllHeaders(); for (Header h : headers) { Map<String, List<String>> hs = req.getHeaders(); String key = h.getName(); List<String> val = hs.get(key); if (val == null) val = new ArrayList<String>(); val.add(h.getValue()); hs.put(key, val); } req.getCookies().putAll(this.cookies); fetchResult.setReq(req); // 执行get访问,获取服务端返回内容 HttpResponse response = httpClient.execute(get); headers = response.getAllHeaders(); for (Header h : headers) { Map<String, List<String>> hs = fetchResult.getHeaders(); String key = h.getName(); List<String> val = hs.get(key); if (val == null) val = new ArrayList<String>(); val.add(h.getValue()); hs.put(key, val); } // 设置已访问URL fetchResult.setFetchedUrl(toFetchURL); String uri = get.getURI().toString(); if (!uri.equals(toFetchURL)) if (!URLCanonicalizer.getCanonicalURL(uri).equals(toFetchURL)) fetchResult.setFetchedUrl(uri); entity = response.getEntity(); // 服务端返回的状态码 int statusCode = response.getStatusLine().getStatusCode(); if (statusCode != HttpStatus.SC_OK) { if (statusCode != HttpStatus.SC_NOT_FOUND) { Header locationHeader = response.getFirstHeader("Location"); // 如果是301、302跳转,获取跳转URL即可返回 if (locationHeader != null && (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY)) fetchResult.setMovedToUrl( URLCanonicalizer.getCanonicalURL(locationHeader.getValue(), toFetchURL)); } // 只要不是OK的除了设置跳转URL外设置statusCode即可返回 // 判断是否有忽略状态码的设置 if (this.site.getSkipStatusCode() != null && this.site.getSkipStatusCode().trim().length() > 0) { String[] scs = this.site.getSkipStatusCode().split(","); for (String code : scs) { int c = CommonUtil.toInt(code); // 忽略此状态码,依然解析entity if (statusCode == c) { assemPage(fetchResult, entity); break; } } } fetchResult.setStatusCode(statusCode); return fetchResult; } // 处理服务端返回的实体内容 if (entity != null) { fetchResult.setStatusCode(statusCode); assemPage(fetchResult, entity); return fetchResult; } } catch (Throwable e) { fetchResult.setFetchedUrl(e.toString()); fetchResult.setStatusCode(Status.INTERNAL_SERVER_ERROR.ordinal()); return fetchResult; } finally { try { if (entity == null && get != null) get.abort(); } catch (Exception e) { throw e; } } fetchResult.setStatusCode(Status.UNSPECIFIED_ERROR.ordinal()); return fetchResult; }
/** * 请求 * * @date 2013-1-7 上午11:08:54 * @param toFetchURL * @return */ public FetchResult request(FetchRequest req) throws Exception { FetchResult fetchResult = new FetchResult(); HttpUriRequest request = null; HttpEntity entity = null; String toFetchURL = req.getUrl(); boolean isPost = false; try { if (Http.Method.GET.equalsIgnoreCase(req.getHttpMethod())) request = new HttpGet(toFetchURL); else if (Http.Method.POST.equalsIgnoreCase(req.getHttpMethod())) { request = new HttpPost(toFetchURL); isPost = true; } else if (Http.Method.PUT.equalsIgnoreCase(req.getHttpMethod())) request = new HttpPut(toFetchURL); else if (Http.Method.HEAD.equalsIgnoreCase(req.getHttpMethod())) request = new HttpHead(toFetchURL); else if (Http.Method.OPTIONS.equalsIgnoreCase(req.getHttpMethod())) request = new HttpOptions(toFetchURL); else if (Http.Method.DELETE.equalsIgnoreCase(req.getHttpMethod())) request = new HttpDelete(toFetchURL); else throw new Exception("Unknown http method name"); // 同步信号量,在真正对服务端进行访问之前进行访问间隔的控制 // TODO 针对每个请求有一个delay的参数设置 synchronized (mutex) { // 获取当前时间 long now = (new Date()).getTime(); // 对同一个Host抓取时间间隔进行控制,若在设置的时限内则进行休眠 if (now - lastFetchTime < config.getPolitenessDelay()) Thread.sleep(config.getPolitenessDelay() - (now - lastFetchTime)); // 不断更新最后的抓取时间,注意,是针对HOST的,不是针对某个URL的 lastFetchTime = (new Date()).getTime(); } // 设置请求GZIP压缩,注意,前面必须设置GZIP解压缩处理 request.addHeader("Accept-Encoding", "gzip"); for (Iterator<Entry<String, String>> it = headers.entrySet().iterator(); it.hasNext(); ) { Entry<String, String> entry = it.next(); request.addHeader(entry.getKey(), entry.getValue()); } // 记录请求信息 Header[] headers = request.getAllHeaders(); for (Header h : headers) { Map<String, List<String>> hs = req.getHeaders(); String key = h.getName(); List<String> val = hs.get(key); if (val == null) val = new ArrayList<String>(); val.add(h.getValue()); hs.put(key, val); } req.getCookies().putAll(this.cookies); fetchResult.setReq(req); HttpEntity reqEntity = null; if (Http.Method.POST.equalsIgnoreCase(req.getHttpMethod()) || Http.Method.PUT.equalsIgnoreCase(req.getHttpMethod())) { if (!req.getFiles().isEmpty()) { reqEntity = new MultipartEntity(HttpMultipartMode.BROWSER_COMPATIBLE); for (Iterator<Entry<String, List<File>>> it = req.getFiles().entrySet().iterator(); it.hasNext(); ) { Entry<String, List<File>> e = it.next(); String paramName = e.getKey(); for (File file : e.getValue()) { // For File parameters ((MultipartEntity) reqEntity).addPart(paramName, new FileBody(file)); } } for (Iterator<Entry<String, List<Object>>> it = req.getParams().entrySet().iterator(); it.hasNext(); ) { Entry<String, List<Object>> e = it.next(); String paramName = e.getKey(); for (Object paramValue : e.getValue()) { // For usual String parameters ((MultipartEntity) reqEntity) .addPart( paramName, new StringBody( String.valueOf(paramValue), "text/plain", Charset.forName("UTF-8"))); } } } else { List<NameValuePair> params = new ArrayList<NameValuePair>(req.getParams().size()); for (Iterator<Entry<String, List<Object>>> it = req.getParams().entrySet().iterator(); it.hasNext(); ) { Entry<String, List<Object>> e = it.next(); String paramName = e.getKey(); for (Object paramValue : e.getValue()) { params.add(new BasicNameValuePair(paramName, String.valueOf(paramValue))); } } reqEntity = new UrlEncodedFormEntity(params, HTTP.UTF_8); } if (isPost) ((HttpPost) request).setEntity(reqEntity); else ((HttpPut) request).setEntity(reqEntity); } // 执行请求,获取服务端返回内容 HttpResponse response = httpClient.execute(request); headers = response.getAllHeaders(); for (Header h : headers) { Map<String, List<String>> hs = fetchResult.getHeaders(); String key = h.getName(); List<String> val = hs.get(key); if (val == null) val = new ArrayList<String>(); val.add(h.getValue()); hs.put(key, val); } // 设置已访问URL fetchResult.setFetchedUrl(toFetchURL); String uri = request.getURI().toString(); if (!uri.equals(toFetchURL)) if (!URLCanonicalizer.getCanonicalURL(uri).equals(toFetchURL)) fetchResult.setFetchedUrl(uri); entity = response.getEntity(); // 服务端返回的状态码 int statusCode = response.getStatusLine().getStatusCode(); if (statusCode != HttpStatus.SC_OK) { if (statusCode != HttpStatus.SC_NOT_FOUND) { Header locationHeader = response.getFirstHeader("Location"); // 如果是301、302跳转,获取跳转URL即可返回 if (locationHeader != null && (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY)) fetchResult.setMovedToUrl( URLCanonicalizer.getCanonicalURL(locationHeader.getValue(), toFetchURL)); } // 只要不是OK的除了设置跳转URL外设置statusCode即可返回 // 判断是否有忽略状态码的设置 if (this.site.getSkipStatusCode() != null && this.site.getSkipStatusCode().trim().length() > 0) { String[] scs = this.site.getSkipStatusCode().split(","); for (String code : scs) { int c = CommonUtil.toInt(code); // 忽略此状态码,依然解析entity if (statusCode == c) { assemPage(fetchResult, entity); break; } } } fetchResult.setStatusCode(statusCode); return fetchResult; } // 处理服务端返回的实体内容 if (entity != null) { fetchResult.setStatusCode(statusCode); assemPage(fetchResult, entity); return fetchResult; } } catch (Throwable e) { fetchResult.setFetchedUrl(e.toString()); fetchResult.setStatusCode(Status.INTERNAL_SERVER_ERROR.ordinal()); return fetchResult; } finally { try { if (entity == null && request != null) request.abort(); } catch (Exception e) { throw e; } } fetchResult.setStatusCode(Status.UNSPECIFIED_ERROR.ordinal()); return fetchResult; }
/** * 构造器,进行client的参数设置,包括Header、Cookie等 * * @param aconfig * @param cookies */ public void init(SpiderConfig config, Site _site) { this.config = config; // 设置HTTP参数 HttpParams params = new BasicHttpParams(); params.setParameter(CoreProtocolPNames.USER_AGENT, config.getUserAgentString()); params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, config.getSocketTimeout()); params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, config.getConnectionTimeout()); HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); paramsBean.setVersion(HttpVersion.HTTP_1_1); paramsBean.setContentCharset("UTF-8"); paramsBean.setUseExpectContinue(false); SchemeRegistry schemeRegistry = new SchemeRegistry(); schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory())); if (config.isIncludeHttpsPages()) schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory())); connectionManager = new ThreadSafeClientConnManager(schemeRegistry); connectionManager.setMaxTotal(config.getMaxTotalConnections()); connectionManager.setDefaultMaxPerRoute(config.getMaxConnectionsPerHost()); httpClient = new DefaultHttpClient(connectionManager, params); httpClient.getParams().setIntParameter("http.socket.timeout", 60000); httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH); httpClient.getParams().setParameter(ClientPNames.HANDLE_REDIRECTS, config.isFollowRedirects()); // HttpClientParams.setCookiePolicy(httpClient.getParams(),CookiePolicy.BEST_MATCH); // 设置响应拦截器 httpClient.addResponseInterceptor( new HttpResponseInterceptor() { public void process(final HttpResponse response, final HttpContext context) throws HttpException, IOException { HttpEntity entity = response.getEntity(); Header contentEncoding = entity.getContentEncoding(); if (contentEncoding != null) { HeaderElement[] codecs = contentEncoding.getElements(); for (HeaderElement codec : codecs) { // 处理GZIP解压缩 if (codec.getName().equalsIgnoreCase("gzip")) { response.setEntity(new GzipDecompressingEntity(response.getEntity())); return; } } } } }); if (_site != null) { this.site = _site; if (this.site.getHeaders() != null && this.site.getHeaders().getHeader() != null) { for (org.eweb4j.spiderman.xml.Header header : this.site.getHeaders().getHeader()) { this.addHeader(header.getName(), header.getValue()); } } if (this.site.getCookies() != null && this.site.getCookies().getCookie() != null) { for (org.eweb4j.spiderman.xml.Cookie cookie : this.site.getCookies().getCookie()) { this.addCookie(cookie.getName(), cookie.getValue(), cookie.getHost(), cookie.getPath()); } } } }