/**
 * Downloads and parses the page whose URL is built by {@code getYahooURL(earningType, date)}.
 *
 * <p>The method sleeps two seconds before the first request. The request is then tried until a
 * response is obtained or the attempt budget is used up; a failed attempt is followed by a
 * four-second pause. The budget is smaller when {@code isWeekend(date)} is true. Because the
 * loop condition is {@code attempts <= maxAttempt}, up to {@code maxAttempt + 1} requests are
 * sent.
 *
 * @param date date passed to {@code getYahooURL} and {@code isWeekend}
 * @param earningType earnings type passed to {@code getYahooURL}
 * @return the document parsed from the response that was already received
 * @throws Exception if no attempt produced a response (the last failure is attached as the
 *     cause), if {@code isValidConnection} rejects the response, if parsing fails, or if the
 *     thread is interrupted while sleeping
 */
public static Document getDocument(LocalDate date, Earning.EARNINGS_TYPE earningType)
    throws Exception {
  Thread.sleep(2000);
  String actualURLRequest = getYahooURL(earningType, date);

  Response calResponse = null;
  Exception lastFailure = null;
  int calRespAttempts = 0;
  int maxAttempt = isWeekend(date) ? 2 : 5;

  while (calRespAttempts <= maxAttempt && calResponse == null) {
    calRespAttempts++;
    try {
      calResponse = Jsoup.connect(actualURLRequest).execute();
    } catch (Exception e1) {
      // Remember why the attempt failed so the final error is diagnosable.
      lastFailure = e1;
      Thread.sleep(4000);
    }
  }

  if (calResponse == null) {
    throw new Exception("The connection response was null;" + actualURLRequest, lastFailure);
  }
  if (!isValidConnection(calResponse)) {
    throw new Exception(
        "Invalid HTTP Status Code:" + calResponse.statusCode() + ";" + actualURLRequest);
  }
  // Parse the body that was already downloaded. Calling Connection.get() here would send the
  // same request a second time.
  return calResponse.parse();
}
/**
 * Implementation method: posts the activation form to {@code BASE_URL} and extracts one C-line
 * from the returned page.
 *
 * <p>The C-line is the text between the markers {@code "C: "} and {@code " :|: and it will "}.
 * It is split on single spaces into host, port, user and password. Any failure (network error,
 * missing marker, too few fields) is reported on standard output and yields an empty list.
 *
 * @return a list holding the extracted entry, or an empty list when nothing could be extracted
 */
public List<CCCAMEntity> getLines() {
  List<CCCAMEntity> clines = new ArrayList<CCCAMEntity>();
  try {
    Response res =
        Jsoup.connect(BASE_URL)
            .data("user", "RDS580" + System.currentTimeMillis())
            .data("pass", "RDS580")
            .data("submit", "Activate!")
            .userAgent(
                "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0")
            .referrer(BASE_URL)
            .method(Method.POST)
            .execute();
    final String linesweb = res.body();
    final String lineSearch1 = "C: ";
    final String lineSearch2 = " :|: and it will ";

    // Check the start marker explicitly: with indexOf == -1 the old arithmetic silently
    // started the substring at offset 2 and could produce a bogus entry.
    final int startMarkerPos = linesweb.indexOf(lineSearch1);
    final int lineStart = startMarkerPos + lineSearch1.length();
    final int lineEnd = startMarkerPos < 0 ? -1 : linesweb.indexOf(lineSearch2, lineStart);
    if (lineEnd < 0) {
      System.out.println("Error en " + BASE_URL + ". C-line markers not found in response");
      return clines;
    }

    final String[] tokens = linesweb.substring(lineStart, lineEnd).trim().split(" ");
    if (tokens.length < 4) {
      System.out.println(
          "Error en " + BASE_URL + ". Incomplete C-line: " + tokens.length + " field(s)");
      return clines;
    }

    final String host = tokens[0].trim();
    final String port = tokens[1].trim();
    final String user = tokens[2].trim();
    final String pass = tokens[3].trim();
    clines.add(new CCCAMEntity(host, port, user, pass, default_hops));
  } catch (Exception e) {
    // Best-effort source: report the problem and fall through to the empty result.
    System.out.println("Error en " + BASE_URL + ". " + e.getMessage());
  }
  return clines;
}
/**
 * Downloads the login captcha image, writes it to {@code EzraPoundUtil.CAPTCHA_DIR} and stores
 * the cookies of that response in {@code captchaCookies}.
 *
 * <p>{@code captchaCookies} is cleared on every call, including calls that give up. When the
 * request fails the method calls itself with {@code times + 1} until {@code times} exceeds
 * {@code maxRecursiveTimes}.
 *
 * <p>NOTE(review): a failure while writing the image file is only printed; the method still
 * stores the cookies, logs the image as saved and returns true.
 *
 * @param times number of the current attempt; callers in this file start at 0
 * @return true once a captcha response was received, false when the attempt limit is exceeded
 */
public boolean getCaptchaImgAndCookies(int times) {
  captchaCookies.clear();
  if (times > maxRecursiveTimes) {
    return false;
  }
  Connection con =
      JsoupUtil.getResourceCon(
          "https://www.zhihu.com/captcha.gif?r=" + System.currentTimeMillis() + "&type=login");
  Response rs = null;
  try {
    rs = con.execute();
  } catch (IOException e) {
    e.printStackTrace();
    log.info("获取验证码第" + times + "次失败");
    return getCaptchaImgAndCookies(++times);
  }
  File file = new File(EzraPoundUtil.CAPTCHA_DIR);
  // try-with-resources closes the stream even if the write fails; it was previously leaked.
  try (FileOutputStream out = new FileOutputStream(file)) {
    out.write(rs.bodyAsBytes());
  } catch (IOException e) {
    e.printStackTrace();
  }
  captchaCookies.putAll(rs.cookies());
  log.info("验证码已保存" + ",路径为:" + file.getAbsolutePath());
  log.info("验证码对应cookie为:" + captchaCookies);
  return true;
}
/*
 * Derives the full download URL of a track from its preview URL.
 *
 * No direct way of extracting the download URL is known, but the preview URL is easy to obtain
 * and differs from the download URL only in its path prefix and in the stream server number:
 *
 *   http://stream8.mxcdn.com/previews/9/6/a/e/93a8-2d77-4573-85c5-68bfb679d9bc.mp3              - preview
 *   http://stream11.mxcdn.com/cloudcasts/originals/9/6/a/e/93a8-2d77-4573-85c5-68bfb679d9bc.mp3 - download
 *
 * The method therefore replaces "previews" with "cloudcasts/originals" and requests the result.
 * If that request fails with an HTTP status error, it substitutes stream1 .. stream30 in turn and
 * returns the first URL whose parsed response renders to at least 2000 characters.
 *
 * Returns null when no server in that range qualifies. An IOException that is not an
 * HttpStatusException is propagated to the caller.
 */
private String generateStreamURL() throws IOException {
  final String browserAgent = "Mozilla/5.0 (Windows NT 6.1; rv:17.0) Gecko/20100101 Firefox/17.0";
  final String downloadUrl = this.getPreviewURL().replaceAll("previews", "cloudcasts/originals");

  try {
    // Only the absence of an HTTP status error matters; the response itself is not needed.
    Jsoup.connect(downloadUrl).ignoreContentType(true).userAgent(browserAgent).execute();
    return downloadUrl;
  } catch (HttpStatusException firstAttempt) {
    // The server named in the preview URL does not host the original: probe the others below.
  }

  final int serversToCycle = 30;
  for (int server = 1; server <= serversToCycle; server++) {
    final String cycledUrl = downloadUrl.replaceAll("stream[0-9]+", ("stream" + server));
    try {
      Response probe =
          Jsoup.connect(cycledUrl).ignoreContentType(true).userAgent(browserAgent).execute();
      if (probe.parse().toString().length() >= 2000) {
        return cycledUrl;
      }
    } catch (HttpStatusException cycledAttempt) {
      // This server rejected the request; move on to the next one.
    }
  }
  return null;
}
/** * 下载验证码图片并保存到手机内部空间<br /> * 下载完后会发送一条广播给《一键签到》主程序,让其显示验证码输入框 * @param captchaUrl * @param ua * @param cookies * @param siteName * @param user * @param reason * @return */ private static boolean downloadCaptchaPic(String captchaUrl, String ua, HashMap<String, String> cookies, String siteName, String user, String reason) { Response res; boolean isSucceed = false; for(int i=0;i<RETRY_TIMES;i++) { try { res = Jsoup.connect(captchaUrl).cookies(cookies).userAgent(ua).timeout(TIME_OUT).ignoreContentType(true).referrer(captchaUrl).method(Method.GET).execute(); cookies.putAll(res.cookies()); try { deleteCaptchaFile();//删除遗留的验证码 saveCaptchaToFile(res.bodyAsBytes());//保存验证码到文件 sendShowCaptchaDialogBC(siteName, user, reason);//给《一键签到》主程序发送广播,让其显示验证码 isSucceed = true; pauseThread();//用线程锁暂停签到线程,如果按下了验证码输入窗口的“确定”或“取消”,程序会对签到线程进行解锁 break;//跳出重试 } catch (Exception e) { //保存验证码到文件失败 isSucceed = false; e.printStackTrace(); } } catch (IOException e) { //拉取验证码失败 isSucceed = false; e.printStackTrace(); } } return isSucceed; }
/** * Parse cn page and write in hbase * * @param symbol */ public static void parseCNSymbols(String symbol) { if (!Hbase.getData(symbol).equals("")) { // System.out.println(symbol + " Exists!"); return; } String url = "http://xueqiu.com/S/" + symbol + "/historical.csv"; Response rs = null; // System.out.println(url); try { Connection con = getConnection(url, "historyHttp"); con.header("Referer", " http://xueqiu.com/S/" + symbol); rs = con.execute(); // System.out.println(rs.body()); } catch (IOException e1) { if (handleError) { System.out.println(symbol + " http error"); errors.add(symbol); } else { WriteError(symbol); System.out.println(symbol + " http error"); } return; } try { BufferedReader reader = new BufferedReader(new StringReader(rs.body())); // 换成你的文件名 reader.readLine(); // 第一行信息,为标题信息,不用,如果需要,注释掉 String line = null; JSONArray HistoricalData = new JSONArray(); List<JSONArray> jsonLists = new ArrayList<JSONArray>(); while ((line = reader.readLine()) != null) { String item[] = line.split(","); // CSV格式文件为逗号分隔符文件,这里根据逗号切分 // System.out.println(item[0]); JSONArray DailyData = new JSONArray(); for (int i = 1; i < item.length; i++) { item[i] = item[i].replace("\"", ""); DailyData.put(item[i]); } if (Double.valueOf(DailyData.getString(2)) != 0) { jsonLists.add(DailyData); } } for (int i = (jsonLists.size() - 1); i >= 0; i--) { HistoricalData.put(jsonLists.get(i)); } Hbase.addData(symbol, type, HistoricalData.toString()); // System.out.println(symbol + " done"); // System.out.println(jsonLists); } catch (Exception e) { if (handleError) { System.out.println(symbol + " parsing error"); errors.add(symbol); } else { WriteError(symbol); System.out.println(symbol + " parsing error"); } } }
/**
 * Requests {@code GET_URL} and deserializes the JSON body into an {@code AccessToken}.
 *
 * @return the deserialized token, or null when the request or the deserialization throws (the
 *     exception is logged)
 */
private AccessToken getAccessToken() {
  final String json;
  try {
    json = Jsoup.connect(GET_URL).ignoreContentType(true).method(Method.GET).execute().body();
    return new Gson().fromJson(json, new TypeToken<AccessToken>() {}.getType());
  } catch (Exception e) {
    log.error("[GetAccessTokenTask] getAccessToken Error:", e);
    return null;
  }
}
/** * 取得本次连接的sessionID * * @return */ private String getSessionID() { Response response = null; for (int i = 0; i < tryTimes; i++) { try { response = Jsoup.connect(HOME_PAGE).userAgent(userAgent).timeout(timeout).execute(); } catch (IOException e) { // ignore } if (response != null) { break; } } return response.cookie("PHPSESSID"); }
/**
 * Tries to log in with the cookies stored at {@code EzraPoundUtil.LOGIN_COOKIES_DIR}.
 *
 * <p>{@code loginCookies} is cleared and refilled through {@code readCookies}, then the home
 * page is requested with those cookies and the result is handed to {@code checkLogin}.
 *
 * @return the result of {@code checkLogin} for the returned page, or false when the request
 *     fails with an {@code IOException}
 */
public boolean loginBySavedCookies() {
  loginCookies.clear();
  readCookies(EzraPoundUtil.LOGIN_COOKIES_DIR, loginCookies);

  final Connection homeRequest = JsoupUtil.getGetCon("https://www.zhihu.com");
  final Response homePage;
  try {
    homePage = homeRequest.cookies(loginCookies).execute();
  } catch (IOException e) {
    e.printStackTrace();
    log.info("携带cookie登录测试失败");
    return false;
  }
  return checkLogin(Jsoup.parse(homePage.body()));
}
/**
 * Posts key/value pairs to {@code url} and returns the response body after
 * {@code changeEncoding}.
 *
 * <p>The request uses {@code timeout(0)} and is repeated immediately, without limit, for as long
 * as it fails with an {@code IOException}. Progress lines are printed to standard output.
 *
 * @param response class whose name is used only in the progress output
 * @param strings alternating keys and values; an odd number of entries fails with an
 *     {@code ArrayIndexOutOfBoundsException}
 * @return the re-encoded response body
 * @throws Exception declared by the signature; nothing in this body throws it directly
 */
private String getContent(Class response, String... strings) throws Exception {
  Map<String, String> data = new HashMap<String, String>();
  int pos = 0;
  while (pos < strings.length) {
    data.put(strings[pos], strings[pos + 1]);
    pos += 2;
  }

  final String label = response.getName();
  System.out.println("Start connect: " + label + ", " + new Date());

  Response res = null;
  do {
    try {
      res = Jsoup.connect(url).data(data).timeout(0).method(Method.POST).execute();
    } catch (IOException e) {
      System.out.println("exception: " + label + ", " + new Date());
    }
  } while (res == null);

  System.out.println("Finish connect: " + label + ", " + new Date());
  return changeEncoding(res.body());
}
/**
 * Tells whether a response carries HTTP status 200.
 *
 * @param response response to inspect; must not be null
 * @return true exactly when {@code response.statusCode()} is 200
 */
public static boolean isValidConnection(Response response) {
  return response.statusCode() == 200;
}
/**
 * Loads the home page and stores the value of its {@code _xsrf} form field in {@code xsrf}.
 *
 * <p>When the request fails the method calls itself with {@code times + 1} until {@code times}
 * exceeds {@code maxRecursiveTimes}.
 *
 * @param times number of the current attempt
 * @return true once a page was received and {@code xsrf} was assigned, false when the attempt
 *     limit is exceeded
 */
public boolean getXsrf(int times) {
  if (times > maxRecursiveTimes) {
    return false;
  }

  final Connection homeRequest = JsoupUtil.getGetCon("http://www.zhihu.com");
  final Response homePage;
  try {
    homePage = homeRequest.execute();
  } catch (IOException e) {
    e.printStackTrace();
    log.info("获取_xsrf第" + times + "次失败");
    return getXsrf(times + 1);
  }

  xsrf = Jsoup.parse(homePage.body()).select(".view.view-signin [name=\"_xsrf\"]").attr("value");
  log.info("已获得xsrf:" + xsrf);
  return true;
}
/**
 * Implementation method: fetches {@code BASE_URL} and reads one C-line from the text of the
 * first {@code h1} element.
 *
 * <p>The heading text is split on single spaces; the tokens at positions 1 to 4 are taken as
 * host, port, user and password. Any exception is reported on standard output and yields an
 * empty list.
 *
 * @return a list holding the extracted entry, or an empty list when extraction failed
 */
public List<CCCAMEntity> getLines() {
  final List<CCCAMEntity> clines = new ArrayList<CCCAMEntity>();
  try {
    final String html =
        Jsoup.connect(BASE_URL)
            .timeout(7500)
            .userAgent(
                "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0")
            .method(Method.GET)
            .execute()
            .body();
    final String heading = Jsoup.parse(html).getElementsByTag("h1").get(0).text();
    final String[] tokens = heading.split(" ");
    // Token 0 is skipped; the four fields follow it.
    clines.add(
        new CCCAMEntity(
            tokens[1].trim(), tokens[2].trim(), tokens[3].trim(), tokens[4].trim(), default_hops));
  } catch (Exception e) {
    System.out.println("Error en " + BASE_URL);
    System.out.println("Error: " + e.getMessage());
  }
  return clines;
}
/**
 * Polls a fixed product-detail URL every five seconds and archives every response.
 *
 * <p>Each round requests the URL through {@code doRequest(url, 3)}, logs status and duration,
 * validates the response with {@code doValidateResponse} and writes the body (or the text
 * {@code "no response"}) to {@code ./data/<class name>/<start time>/<index>.html}. The loop ends
 * after ten consecutive rounds that fail validation.
 *
 * @param args not used
 * @throws Exception if writing a file fails, the sleep is interrupted, or a helper throws
 */
public static void main(String[] args) throws Exception {
  int index = 0;
  long timeout = 5000;
  String pattern = "MMdd_HH_mm_ss";
  String dataDir = "./data/" + YhdPriceMonitor.class.getSimpleName().toLowerCase();
  disableSSLCertCheck();
  String dateChars = DateFormatUtils.format(new Date(), pattern);
  int lastErrorCount = 0;

  while (true) {
    index++;
    // The trailing timestamp makes every URL unique.
    String url =
        "http://gps.yhd.com/restful/detail?mcsite=1&provinceId=1&pmId=41909728&callback=jQuery111304328004347221549_1447325832073&_="
            + System.currentTimeMillis();

    long start = System.currentTimeMillis();
    Response resp = doRequest(url, 3);
    long cost = System.currentTimeMillis() - start;
    if (resp == null) {
      log.warn("error,index:" + index + ",status:null,cost:" + cost + ",url:" + url);
    } else {
      log.info(
          "done,index:" + index + ",status:" + resp.statusCode() + ",cost:" + cost + ",url:" + url);
    }

    boolean success = doValidateResponse(resp);
    if (success) {
      lastErrorCount = 0;
      log.info("validate=true.index:" + index + ",cookies:" + JSON.toJSONString(resp.cookies()));
      log.info("validate=true.index:" + index + ",headers:" + JSON.toJSONString(resp.headers()));
    } else {
      lastErrorCount++;
      if (resp != null) {
        log.warn(
            "validate=false.index:" + index + ",cookies:" + JSON.toJSONString(resp.cookies()));
        // Label fixed: this line logs the headers but was labelled "cookies:".
        log.warn(
            "validate=false.index:" + index + ",headers:" + JSON.toJSONString(resp.headers()));
      }
    }

    FileUtils.writeStringToFile(
        new File(dataDir, dateChars + File.separator + index + ".html"),
        resp == null ? "no response" : resp.body());

    if (lastErrorCount >= 10) {
      break;
    }
    TimeUnit.MILLISECONDS.sleep(timeout);
  }
}
/**
 * Checks that a response body contains every expected fragment.
 *
 * @param resp response to check; may be null
 * @return false when {@code resp} is null or its body lacks any of {@code "marketPrice"},
 *     {@code "89"} or {@code "41909728"}; true otherwise
 */
private static boolean doValidateResponse(Response resp) {
  if (resp == null) {
    return false;
  }
  final String body = resp.body();
  final String[] requiredFragments = {"marketPrice", "89", "41909728"};
  for (String fragment : requiredFragments) {
    if (!body.contains(fragment)) {
      return false;
    }
  }
  return true;
}
public static Map<String, String> login( HttpServletRequest req, HttpServletResponse res, String username, String password) throws Exception { String url = ReadProperties.getByName("login.ip") + "/login"; Map<String, String> datas = new HashMap<String, String>(); Map<String, String> cookies = new HashMap<String, String>(); Connection con = Jsoup.connect(url).timeout(120000); // 获取连接 con.header( "User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20100101 Firefox/29.0"); // 配置模拟浏览器 Response rs; rs = con.execute(); cookies = rs.cookies(); Document doc = Jsoup.parse(rs.body()); // 转换为Dom树 List<Element> et = doc.select("form"); // 获取form表单,可以通过查看页面源码代码得知 for (Element e : et.get(0).getAllElements()) { if (e.attr("name").equals("username")) { e.attr("value", username); // 设置用户名 } if (e.attr("name").equals("password")) { e.attr("value", password); // 设置用户密码 } if (e.attr("name").length() > 0) { // 排除空值表单属性 datas.put(e.attr("name"), e.attr("value")); } } // 设置cookie和post上面的map数据 Response login = null; login = con.data(datas).cookies(cookies).method(Method.POST).execute(); url = ReadProperties.getByName("common.ip") + req.getContextPath() + "/user/getUser"; con = Jsoup.connect(url) .cookies(login.cookies()) .ignoreContentType(true) .method(Method.GET); // 获取连接 rs = con.execute(); for (Entry<String, String> entry : rs.cookies().entrySet()) { Cookie cookie = new Cookie(entry.getKey(), entry.getValue()); cookie.setPath(req.getContextPath() + "/"); res.addCookie(cookie); } return JsonUtil.jsonToObject(rs.body(), Map.class); }
/**
 * Interactive login with e-mail, password and captcha read from standard input.
 *
 * <p>The captcha image is fetched first through {@code getCaptchaImgAndCookies(0)}. The
 * credentials are then posted together with {@code xsrf} and {@code captchaCookies}, and the
 * server message is logged. The home page is requested with the cookies of the login response
 * and checked with {@code checkLogin}. On success those cookies are stored in
 * {@code loginCookies} and saved through {@code saveCookies}.
 *
 * @return true if {@code checkLogin} accepts the home page; false if either request fails with
 *     an {@code IOException} or the check fails
 */
public boolean loginByEmailAndPwd() {
  loginCookies.clear();
  // Deliberately not closed: closing the Scanner would close System.in as well.
  Scanner sc = new Scanner(System.in);
  getCaptchaImgAndCookies(0);
  log.info("请输入账号:");
  email = sc.nextLine();
  log.info("请输入密码");
  password = sc.nextLine();
  log.info("查看验证码并输入");
  captcha = sc.nextLine();

  Connection con = JsoupUtil.getPostCon("https://www.zhihu.com/login/email");
  Response rs = null;
  try {
    rs =
        con.data("_xsrf", xsrf)
            .data("email", email)
            .data("password", password)
            .data("remember_me", remeberMe)
            .data("captcha", captcha)
            .cookies(captchaCookies)
            .ignoreContentType(true)
            .execute();
  } catch (IOException e) {
    e.printStackTrace();
    log.info("通过账号密码登录发生异常");
    return false;
  }

  JSONObject jsonObject = new JSONObject(rs.body());
  // The value is not used, but the lookup is kept so a reply without "r" still fails here.
  jsonObject.get("r");
  log.info(EzraPoundUtil.unicode2Character(jsonObject.get("msg").toString()));

  Response rs2;
  try {
    rs2 = JsoupUtil.getGetCon("https://www.zhihu.com").cookies(rs.cookies()).execute();
  } catch (IOException e) {
    e.printStackTrace();
    // Without the verification page the login cannot be confirmed. Previously execution
    // continued and failed with a NullPointerException on rs2.
    return false;
  }

  if (checkLogin(Jsoup.parse(rs2.body()))) {
    loginCookies.putAll(rs.cookies());
    saveCookies(EzraPoundUtil.LOGIN_COOKIES_DIR, loginCookies);
    return true;
  }
  return false;
}
/**
 * Posts the stored credentials to the login endpoint and, depending on the returned page,
 * either completes the session setup, handles a captcha challenge, or reports an error.
 *
 * <p>Success is recognised by an {@code input[name=skypetoken]} element in the response. In
 * that case the token and an additional cookie are stored, the endpoint and web socket are
 * registered, contacts are loaded and the two background threads are started.
 *
 * <p>Otherwise, if the page holds a {@code #captchaContainer} from which an image URL with
 * {@code hid} and {@code fid} parameters can be extracted, a {@code CaptchaEvent} is dispatched
 * and this method calls itself with the answer. If no captcha was handled, the text of the
 * {@code .message_error} element is raised as {@code InvalidCredentialsException}.
 *
 * @param captchaData passed through to {@code postToLogin}; the recursive call supplies
 *     {@code {answer, hid, fid}}
 * @throws InvalidCredentialsException if the page carries an error message, or no captcha was
 *     handled and no error message was found
 * @throws ConnectionException declared by the signature; presumably raised by the helper calls
 *     outside the captcha {@code try} block — confirm against their declarations
 * @throws ParseException if the login response cannot be parsed
 */
private void login(String[] captchaData)
    throws InvalidCredentialsException, ConnectionException, ParseException {
  final Response loginResponse = postToLogin(username, password, captchaData);
  this.cookies = new HashMap<>(loginResponse.cookies());
  Document loginResponseDocument;
  try {
    loginResponseDocument = loginResponse.parse();
  } catch (IOException e) {
    throw new ParseException("While parsing the login response", e);
  }
  Elements inputs = loginResponseDocument.select("input[name=skypetoken]");
  if (inputs.size() > 0) {
    // Login accepted: the page carries the token.
    this.setSkypeToken(inputs.get(0).attr("value"));
    HttpURLConnection asmResponse = getAsmToken();
    // Take only the name=value part of the Set-Cookie header.
    // NOTE(review): getHeaderField returns null when the header is absent — confirm the header
    // is always present here.
    String[] setCookie = asmResponse.getHeaderField("Set-Cookie").split(";")[0].split("=");
    this.cookies.put(setCookie[0], setCookie[1]);
    registerEndpoint();
    this.loadAllContacts();
    this.getContactRequests(false);
    try {
      this.registerWebSocket();
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
    loggedIn.set(true);
    (sessionKeepaliveThread = new KeepaliveThread(this)).start();
    (reauthThread = new AuthenticationChecker(this)).start();
  } else {
    // Set only after a captcha round trip finished; it suppresses the error-message check below.
    boolean foundError = false;
    Elements captchas = loginResponseDocument.select("#captchaContainer");
    if (captchas.size() > 0) {
      Element captcha = captchas.get(0);
      String url = null;
      for (Element scriptTag : captcha.getElementsByTag("script")) {
        String text = scriptTag.html();
        if (text.contains("skypeHipUrl")) {
          // The URL is whatever lies between the first and the last double quote of the script.
          url = text.substring(text.indexOf('"') + 1, text.lastIndexOf('"'));
        }
      }
      if (url != null) {
        try {
          String rawjs = Endpoints.custom(url, this).as(String.class).get();
          Pattern p = Pattern.compile("imageurl:'([^']*)'");
          Matcher m = p.matcher(rawjs);
          if (m.find()) {
            String imgurl = m.group(1);
            m = Pattern.compile("hid=([^&]*)").matcher(imgurl);
            if (m.find()) {
              String hid = m.group(1);
              m = Pattern.compile("fid=([^&]*)").matcher(imgurl);
              if (m.find()) {
                String fid = m.group(1);
                // Hand the image URL to the event listeners and read back their answer.
                CaptchaEvent event = new CaptchaEvent(imgurl);
                getEventDispatcher().callEvent(event);
                String response = event.getCaptcha();
                if (response != null) {
                  // Retry the login with the captcha answer attached.
                  login(new String[] {response, hid, fid});
                } else {
                  throw new CaptchaException();
                }
                foundError = true;
              }
            }
          }
        } catch (ConnectionException e) {
          // Fetching the captcha script failed: report it as a minor error and fall through to
          // the error-message check.
          MinorErrorEvent err = new MinorErrorEvent(MinorErrorEvent.ErrorSource.PARSING_CAPTCHA, e);
          getEventDispatcher().callEvent(err);
        }
      }
    }
    if (!foundError) {
      Elements elements = loginResponseDocument.select(".message_error");
      if (elements.size() > 0) {
        Element div = elements.get(0);
        // NOTE(review): when the element has fewer than two children nothing is thrown and the
        // method returns without having logged in — confirm this is intended.
        if (div.children().size() > 1) {
          Element span = div.child(1);
          throw new InvalidCredentialsException(span.text());
        }
      } else {
        throw new InvalidCredentialsException(
            "Could not find error message. Dumping entire page. \n" + loginResponseDocument.html());
      }
    }
  }
}
public int crawl(int ok, int fail, String url, boolean isShort) { if (!ua.equals("") && ua != null) UA = ua; try { Document doc; if (isShort) { Response resp = Jsoup.connect(url).userAgent(UA).followRedirects(true).execute(); doc = resp.parse(); } else { doc = Jsoup.connect(url).userAgent(UA).timeout(10000).get(); } resultTitle = resultCont = ""; /*MLog.e("","title_rex="+title_rex); MLog.e("","cont_rex="+cont_rex); MLog.e("","auth_rex="+auth_rex); MLog.e("","extra_rex="+extra_rex); MLog.e("","source="+source+" url="+url);*/ if (cont_rex.contains(" ")) { String ctemp = cont_rex.trim(); String[] cgp = ctemp.split(" "); if (cgp[1].equals("all")) { cont_len = -1; cont_rex = cgp[0]; } } Elements eletitle = doc.select(this.title_rex), eleauth = null, elecont = doc.select(this.cont_rex), eleextra = null; if (Constant.DEBUG) FileUtils.writeFile(doc.html(), "clip"); if (!auth_rex.equals("")) eleauth = doc.select(this.auth_rex); if (!extra_rex.equals("")) eleextra = doc.select(this.extra_rex); if (eletitle.size() > 0) { resultTitle = eletitle.get(0).html(); if (elecont.size() > 0) { elecont = addStyleForTable(elecont); if (cont_len == -1) { for (Element ele : elecont) { resultCont = resultCont + ele.html(); } } else resultCont = elecont.get(0).html(); } if (!auth_rex.equals("")) { if (eleauth.size() > 0) resultCont = "<p>" + eleauth.get(0).html() + "</p>" + resultCont; } if (!extra_rex.equals("")) { eleextra = addStyleForTable(eleextra); if (eleextra.size() > 0) resultCont = resultCont + eleextra.get(0).html(); } return ok; } else { MLog.e("", "没有匹配到title"); return fail; } } catch (IOException e) { // TODO Auto-generated catch block MLog.e("", "没有请求到数据"); return fail; } }
/**
 * Crawls the blog directory and writes the final URL of every listed blog to
 * {@code /users/user/BlogPolku.txt}, one per line.
 *
 * <p>For each listing page the blog cards are read, each blog's detail page is opened, and the
 * first {@code blog-view__title-link} found there is requested with redirects followed. The URL
 * of that response is what gets written. A blog whose link is missing or cannot be requested is
 * skipped. An {@code IOException} while opening the file or loading a listing or detail page
 * ends the crawl and is printed. An interrupt during the pause between blogs also ends the
 * crawl. The output file is closed in every case.
 */
public Parser() {
  final String siteRoot = "http://blogipolku.fi";
  final String url = siteRoot + "/selaa/blogeja";
  final String lineSeparator = System.getProperty("line.separator");
  final File file = new File("/users/user/BlogPolku.txt");

  // FileWriter creates the file when it does not exist; try-with-resources closes the writer
  // on every exit path (it used to stay open whenever an exception occurred).
  try (BufferedWriter bw = new BufferedWriter(new FileWriter(file.getAbsoluteFile()))) {
    // Currently walks a fixed number of pages; should be changed to detect the last page
    // (disabled "next" arrow) instead.
    for (int page = 1; page <= 101; page++) {
      Document listing = Jsoup.connect(url + "?page=" + page).timeout(120 * 1000).get();
      System.out.println("changing page document: " + url + "?page=" + page);
      Elements links = listing.select("a[class=blog-card__title-link]");

      for (int i = 0; i < links.size(); i++) {
        try {
          Thread.sleep(100); // brief pause between blogs
        } catch (InterruptedException e) {
          // Restore the interrupt flag and stop instead of continuing without pauses.
          Thread.currentThread().interrupt();
          return;
        }

        String blogPath = links.get(i).attr("href");
        System.out.println("Connecting...");
        Document blogPage = Jsoup.connect(siteRoot + blogPath).timeout(600 * 1000).get();
        System.out.println("changing document: " + siteRoot + blogPath);

        Elements titleLinks = blogPage.select("a[class=blog-view__title-link]");
        if (titleLinks.isEmpty()) {
          // Previously get(0) threw an uncaught IndexOutOfBoundsException here.
          System.out.println("No blog link found on: " + siteRoot + blogPath);
          continue;
        }
        String blogUrl = titleLinks.get(0).attr("href");
        System.out.println("Response Connecting to link:" + blogUrl + " ...");

        try {
          Response response =
              Jsoup.connect(blogUrl).followRedirects(true).ignoreHttpErrors(true).execute();
          bw.write(response.url() + lineSeparator);
        } catch (Exception e) {
          // Best effort: an unreachable or malformed blog URL must not end the crawl.
          System.out.println("Skipping " + blogUrl + ": " + e);
        }
      }
    }
  } catch (IOException e) {
    e.printStackTrace();
  }
}