/**
 * Checks that {@link DecodeUnreservedCanonicalizer} turns percent-encoded unreserved characters
 * ({@code A Z a z 0 9 - . _ ~}) back into their literal form in the user name, password, path,
 * query and fragment of a URL.
 *
 * <p>Each case pairs an input URL with the URL the canonicalized result must serialize to.
 */
@Test
public void test() throws GalimatiasParseException {
  final URLCanonicalizer canon = new DecodeUnreservedCanonicalizer();

  // The same sample is used in every URL component: "AZaz09-._~" percent-encoded and literal.
  final String encoded = "%41%5A%61%7A%30%39%2D%2E%5F%7E";
  final String decoded = "AZaz09-._~";

  final String[][] cases = {
    // user name
    {"http://" + encoded + "@example.com/", "http://" + decoded + "@example.com/"},
    // password (empty user name)
    {"http://:" + encoded + "@example.com/", "http://:" + decoded + "@example.com/"},
    // path
    {"http://example.com/" + encoded, "http://example.com/" + decoded},
    // query
    {"http://example.com/?" + encoded, "http://example.com/?" + decoded},
    // fragment
    {"http://example.com/#" + encoded, "http://example.com/#" + decoded}
  };

  for (final String[] pair : cases) {
    final String actual = canon.canonicalize(URL.parse(pair[0])).toString();
    final String expected = URL.parse(pair[1]).toString();
    assertThat(actual).isEqualTo(expected);
  }
}
/**
 * Theory: applying {@link DecodeUnreservedCanonicalizer} a second time must not change the
 * result, and the canonicalized URL must survive a serialize/parse round trip unchanged.
 *
 * <p>Data points without a parsed URL are skipped via {@code assumeNotNull}.
 */
@Theory
public void idempotence(
    final @TestURL.TestURLs(dataset = TestURL.DATASETS.WHATWG) TestURL testURL)
    throws GalimatiasParseException {
  assumeNotNull(testURL.parsedURL);

  final URLCanonicalizer canon = new DecodeUnreservedCanonicalizer();

  // Canonicalizing an already-canonical URL must be a no-op.
  final URL once = canon.canonicalize(testURL.parsedURL);
  final URL twice = canon.canonicalize(once);
  assertThat(once).isEqualTo(twice);

  // The canonical form must also be stable across toString() + parse().
  final String serialized = twice.toString();
  final URL reparsed = URL.parse(serialized);
  assertThat(reparsed).isEqualTo(twice);
}
/** * 请求 * * @date 2013-1-7 上午11:08:54 * @param toFetchURL * @return */ public FetchResult request(FetchRequest req) throws Exception { FetchResult fetchResult = new FetchResult(); HttpUriRequest request = null; HttpEntity entity = null; String toFetchURL = req.getUrl(); boolean isPost = false; try { if (Http.Method.GET.equalsIgnoreCase(req.getHttpMethod())) request = new HttpGet(toFetchURL); else if (Http.Method.POST.equalsIgnoreCase(req.getHttpMethod())) { request = new HttpPost(toFetchURL); isPost = true; } else if (Http.Method.PUT.equalsIgnoreCase(req.getHttpMethod())) request = new HttpPut(toFetchURL); else if (Http.Method.HEAD.equalsIgnoreCase(req.getHttpMethod())) request = new HttpHead(toFetchURL); else if (Http.Method.OPTIONS.equalsIgnoreCase(req.getHttpMethod())) request = new HttpOptions(toFetchURL); else if (Http.Method.DELETE.equalsIgnoreCase(req.getHttpMethod())) request = new HttpDelete(toFetchURL); else throw new Exception("Unknown http method name"); // 同步信号量,在真正对服务端进行访问之前进行访问间隔的控制 // TODO 针对每个请求有一个delay的参数设置 synchronized (mutex) { // 获取当前时间 long now = (new Date()).getTime(); // 对同一个Host抓取时间间隔进行控制,若在设置的时限内则进行休眠 if (now - lastFetchTime < config.getPolitenessDelay()) Thread.sleep(config.getPolitenessDelay() - (now - lastFetchTime)); // 不断更新最后的抓取时间,注意,是针对HOST的,不是针对某个URL的 lastFetchTime = (new Date()).getTime(); } // 设置请求GZIP压缩,注意,前面必须设置GZIP解压缩处理 request.addHeader("Accept-Encoding", "gzip"); for (Iterator<Entry<String, String>> it = headers.entrySet().iterator(); it.hasNext(); ) { Entry<String, String> entry = it.next(); request.addHeader(entry.getKey(), entry.getValue()); } // 记录请求信息 Header[] headers = request.getAllHeaders(); for (Header h : headers) { Map<String, List<String>> hs = req.getHeaders(); String key = h.getName(); List<String> val = hs.get(key); if (val == null) val = new ArrayList<String>(); val.add(h.getValue()); hs.put(key, val); } req.getCookies().putAll(this.cookies); fetchResult.setReq(req); HttpEntity reqEntity = null; if 
(Http.Method.POST.equalsIgnoreCase(req.getHttpMethod()) || Http.Method.PUT.equalsIgnoreCase(req.getHttpMethod())) { if (!req.getFiles().isEmpty()) { reqEntity = new MultipartEntity(HttpMultipartMode.BROWSER_COMPATIBLE); for (Iterator<Entry<String, List<File>>> it = req.getFiles().entrySet().iterator(); it.hasNext(); ) { Entry<String, List<File>> e = it.next(); String paramName = e.getKey(); for (File file : e.getValue()) { // For File parameters ((MultipartEntity) reqEntity).addPart(paramName, new FileBody(file)); } } for (Iterator<Entry<String, List<Object>>> it = req.getParams().entrySet().iterator(); it.hasNext(); ) { Entry<String, List<Object>> e = it.next(); String paramName = e.getKey(); for (Object paramValue : e.getValue()) { // For usual String parameters ((MultipartEntity) reqEntity) .addPart( paramName, new StringBody( String.valueOf(paramValue), "text/plain", Charset.forName("UTF-8"))); } } } else { List<NameValuePair> params = new ArrayList<NameValuePair>(req.getParams().size()); for (Iterator<Entry<String, List<Object>>> it = req.getParams().entrySet().iterator(); it.hasNext(); ) { Entry<String, List<Object>> e = it.next(); String paramName = e.getKey(); for (Object paramValue : e.getValue()) { params.add(new BasicNameValuePair(paramName, String.valueOf(paramValue))); } } reqEntity = new UrlEncodedFormEntity(params, HTTP.UTF_8); } if (isPost) ((HttpPost) request).setEntity(reqEntity); else ((HttpPut) request).setEntity(reqEntity); } // 执行请求,获取服务端返回内容 HttpResponse response = httpClient.execute(request); headers = response.getAllHeaders(); for (Header h : headers) { Map<String, List<String>> hs = fetchResult.getHeaders(); String key = h.getName(); List<String> val = hs.get(key); if (val == null) val = new ArrayList<String>(); val.add(h.getValue()); hs.put(key, val); } // 设置已访问URL fetchResult.setFetchedUrl(toFetchURL); String uri = request.getURI().toString(); if (!uri.equals(toFetchURL)) if (!URLCanonicalizer.getCanonicalURL(uri).equals(toFetchURL)) 
fetchResult.setFetchedUrl(uri); entity = response.getEntity(); // 服务端返回的状态码 int statusCode = response.getStatusLine().getStatusCode(); if (statusCode != HttpStatus.SC_OK) { if (statusCode != HttpStatus.SC_NOT_FOUND) { Header locationHeader = response.getFirstHeader("Location"); // 如果是301、302跳转,获取跳转URL即可返回 if (locationHeader != null && (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY)) fetchResult.setMovedToUrl( URLCanonicalizer.getCanonicalURL(locationHeader.getValue(), toFetchURL)); } // 只要不是OK的除了设置跳转URL外设置statusCode即可返回 // 判断是否有忽略状态码的设置 if (this.site.getSkipStatusCode() != null && this.site.getSkipStatusCode().trim().length() > 0) { String[] scs = this.site.getSkipStatusCode().split(","); for (String code : scs) { int c = CommonUtil.toInt(code); // 忽略此状态码,依然解析entity if (statusCode == c) { assemPage(fetchResult, entity); break; } } } fetchResult.setStatusCode(statusCode); return fetchResult; } // 处理服务端返回的实体内容 if (entity != null) { fetchResult.setStatusCode(statusCode); assemPage(fetchResult, entity); return fetchResult; } } catch (Throwable e) { fetchResult.setFetchedUrl(e.toString()); fetchResult.setStatusCode(Status.INTERNAL_SERVER_ERROR.ordinal()); return fetchResult; } finally { try { if (entity == null && request != null) request.abort(); } catch (Exception e) { throw e; } } fetchResult.setStatusCode(Status.UNSPECIFIED_ERROR.ordinal()); return fetchResult; }
/** * 抓取目标url的内容 * * @date 2013-1-7 上午11:08:54 * @param toFetchURL * @return */ public FetchResult fetch(FetchRequest req) throws Exception { if (req.getHttpMethod() != null && !Http.Method.GET.equals(req.getHttpMethod())) { // 获取到URL后面的QueryParam String query = new URL(req.getUrl()).getQuery(); for (String q : query.split("\\&")) { String[] qv = q.split("="); String name = qv[0]; String val = qv[1]; List<Object> vals = req.getParams().get(name); if (vals == null) { vals = new ArrayList<Object>(); req.getParams().put(name, vals); } vals.add(val); } return request(req); } FetchResult fetchResult = new FetchResult(); HttpGet get = null; HttpEntity entity = null; String toFetchURL = req.getUrl(); try { get = new HttpGet(toFetchURL); // 设置请求GZIP压缩,注意,前面必须设置GZIP解压缩处理 get.addHeader("Accept-Encoding", "gzip"); for (Iterator<Entry<String, String>> it = headers.entrySet().iterator(); it.hasNext(); ) { Entry<String, String> entry = it.next(); get.addHeader(entry.getKey(), entry.getValue()); } // 同步信号量,在真正对服务端进行访问之前进行访问间隔的控制 // TODO 针对每个请求有一个delay的参数设置 synchronized (mutex) { // 获取当前时间 long now = (new Date()).getTime(); // 对同一个Host抓取时间间隔进行控制,若在设置的时限内则进行休眠 if (now - lastFetchTime < config.getPolitenessDelay()) Thread.sleep(config.getPolitenessDelay() - (now - lastFetchTime)); // 不断更新最后的抓取时间,注意,是针对HOST的,不是针对某个URL的 lastFetchTime = (new Date()).getTime(); } // 记录get请求信息 Header[] headers = get.getAllHeaders(); for (Header h : headers) { Map<String, List<String>> hs = req.getHeaders(); String key = h.getName(); List<String> val = hs.get(key); if (val == null) val = new ArrayList<String>(); val.add(h.getValue()); hs.put(key, val); } req.getCookies().putAll(this.cookies); fetchResult.setReq(req); // 执行get访问,获取服务端返回内容 HttpResponse response = httpClient.execute(get); headers = response.getAllHeaders(); for (Header h : headers) { Map<String, List<String>> hs = fetchResult.getHeaders(); String key = h.getName(); List<String> val = hs.get(key); if (val == null) val = new 
ArrayList<String>(); val.add(h.getValue()); hs.put(key, val); } // 设置已访问URL fetchResult.setFetchedUrl(toFetchURL); String uri = get.getURI().toString(); if (!uri.equals(toFetchURL)) if (!URLCanonicalizer.getCanonicalURL(uri).equals(toFetchURL)) fetchResult.setFetchedUrl(uri); entity = response.getEntity(); // 服务端返回的状态码 int statusCode = response.getStatusLine().getStatusCode(); if (statusCode != HttpStatus.SC_OK) { if (statusCode != HttpStatus.SC_NOT_FOUND) { Header locationHeader = response.getFirstHeader("Location"); // 如果是301、302跳转,获取跳转URL即可返回 if (locationHeader != null && (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY)) fetchResult.setMovedToUrl( URLCanonicalizer.getCanonicalURL(locationHeader.getValue(), toFetchURL)); } // 只要不是OK的除了设置跳转URL外设置statusCode即可返回 // 判断是否有忽略状态码的设置 if (this.site.getSkipStatusCode() != null && this.site.getSkipStatusCode().trim().length() > 0) { String[] scs = this.site.getSkipStatusCode().split(","); for (String code : scs) { int c = CommonUtil.toInt(code); // 忽略此状态码,依然解析entity if (statusCode == c) { assemPage(fetchResult, entity); break; } } } fetchResult.setStatusCode(statusCode); return fetchResult; } // 处理服务端返回的实体内容 if (entity != null) { fetchResult.setStatusCode(statusCode); assemPage(fetchResult, entity); return fetchResult; } } catch (Throwable e) { fetchResult.setFetchedUrl(e.toString()); fetchResult.setStatusCode(Status.INTERNAL_SERVER_ERROR.ordinal()); return fetchResult; } finally { try { if (entity == null && get != null) get.abort(); } catch (Exception e) { throw e; } } fetchResult.setStatusCode(Status.UNSPECIFIED_ERROR.ordinal()); return fetchResult; }