/**
 * Downloads every stylesheet linked from the given HTML and concatenates their text.
 *
 * <p>Each {@code link[type=text/css]} element's {@code href} is fetched with Jsoup. An href that
 * does not contain {@code PropsValues.PORTAL_URL} is prefixed with it before the request. The
 * text of each fetched document is appended to the result, followed by a newline.
 *
 * @param htmlSource HTML markup to scan for stylesheet links
 * @return the concatenated stylesheet text, one fetched document per line
 * @throws Exception if any stylesheet cannot be fetched
 */
public static String getCSSSource(String htmlSource) throws Exception {
  StringBuilder css = new StringBuilder();
  Elements styleLinks = Jsoup.parse(htmlSource).select("link[type=text/css]");

  for (Element styleLink : styleLinks) {
    String location = styleLink.attr("href");
    boolean mentionsPortal = location.contains(PropsValues.PORTAL_URL);
    String target = mentionsPortal ? location : PropsValues.PORTAL_URL + location;

    css.append(Jsoup.connect(target).get().text());
    css.append("\n");
  }

  return css.toString();
}
/**
 * Fetches the page at {@code url} and extracts the fee from the fourth cell of the second table
 * row inside {@code #block-system-main}.
 *
 * <p>The request is repeated until it completes without an {@link IOException}; there is no upper
 * bound on the number of attempts.
 *
 * @param url page to fetch
 * @return the value produced by {@code getFee} for the cell text, or {@code url} itself when the
 *     cell is not present on the page
 */
public static String requestFee(String url) {
  final String feeCellSelector =
      "#block-system-main > table > tbody > tr:nth-child(2) > td:nth-child(4)";

  while (true) {
    try {
      Document page = Jsoup.connect(url).timeout(5000).get();
      Elements feeCells = page.select(feeCellSelector);
      if (feeCells.size() > 0) {
        return getFee(feeCells.text());
      }
      // No fee cell: hand the URL back unchanged.
      return url;
    } catch (IOException e) {
      System.out.println("requestFee : " + e.getMessage());
    }
  }
}
public static void initMajorList(String originalUrl) { System.out.println("preparing majorList"); boolean finish = false; do { try { majorList.clear(); Connection conn = Jsoup.connect(originalUrl); Document doc = conn.timeout(10000).get(); Elements es = doc.select("#accordion__target-3 > div.course-listing__box > a"); for (Element e : es) { // major MajorForCollection major = new MajorForCollection(); major.setLevel(LEVEL); major.setTitle(e.select("h3").get(0).text().trim()); major.setType(e.select("p").get(0).text().replaceAll("-[\\s\\S]*", "").trim()); major.setUrl(e.select("a").get(0).attr("href")); majorList.add(major); } ; finish = true; } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } while (!finish); System.out.println("majorList prepared"); System.out.println("majorList size: " + majorList.size()); }
/**
 * Fetches the earnings calendar page for the given date and type.
 *
 * <p>Sleeps two seconds before the first request, then tries the request until a response is
 * obtained or the attempt counter exceeds the limit (5, or 2 when {@code isWeekend(date)} is
 * true), sleeping four seconds after each failed attempt. The returned document is parsed from
 * the response that was validated, so no second HTTP request is issued.
 *
 * @param date calendar date to request
 * @param earningType earnings type used to build the request URL
 * @return the parsed page
 * @throws Exception if no response could be obtained, if {@code isValidConnection} rejects the
 *     response, or if the thread is interrupted while sleeping
 */
public static Document getDocument(LocalDate date, Earning.EARNINGS_TYPE earningType)
    throws Exception {
  Thread.sleep(2000);
  String actualURLRequest = getYahooURL(earningType, date);

  int maxAttempt = isWeekend(date) ? 2 : 5;
  int calRespAttempts = 0;
  Response calResponse = null;

  while (calRespAttempts <= maxAttempt && calResponse == null) {
    calRespAttempts++;
    try {
      calResponse = Jsoup.connect(actualURLRequest).execute();
    } catch (Exception e1) {
      // Any failure counts as a failed attempt; back off before the next one.
      Thread.sleep(4000);
    }
  }

  if (calResponse == null) {
    throw new Exception("The connection response was null;" + actualURLRequest);
  }
  if (!isValidConnection(calResponse)) {
    throw new Exception(
        "Invalid HTTP Status Code:" + calResponse.statusCode() + ";" + actualURLRequest);
  }

  // Parse the response we already hold instead of calling get(), which would re-send the
  // request and return a page that was never validated.
  return calResponse.parse();
}
/**
 * Looks up the course type through the Unistats widget referenced by the {@code #kw} element.
 *
 * <p>The widget URL is assembled from the element's {@code data-*} attributes. The type is the
 * first space-delimited word of the widget's {@code h1} heading, or the whole heading when it
 * contains no space.
 *
 * @param doc course page that may contain a {@code #kw} element
 * @return the course type, or an empty string when the element or heading is missing or the
 *     widget request fails
 */
public static String getType(Document doc) {
  Elements widgetHolders = doc.select("#kw");
  if (widgetHolders.size() == 0) {
    return "";
  }

  Element holder = widgetHolders.get(0);
  String widgetUrl =
      "http://widget.unistats.ac.uk/Widget/"
          + holder.attr("data-institution") + "/"
          + holder.attr("data-course") + "/"
          + holder.attr("data-orientation") + "/"
          + "null/"
          + holder.attr("data-language") + "/"
          + holder.attr("data-kismode");

  String type = "";
  try {
    // Single attempt: a failure is printed and the empty default is returned.
    Document widget = Jsoup.connect(widgetUrl).timeout(10000).get();
    Elements headings = widget.select("#kisWidget > div.widgetCourse > h1");
    if (headings.size() > 0) {
      String heading = headings.get(0).text().trim();
      int firstSpace = heading.indexOf(" ");
      type = firstSpace > 0 ? heading.substring(0, firstSpace) : heading;
    }
  } catch (Exception ex) {
    ex.printStackTrace();
  }
  return type;
}
/**
 * Retrieves the raw response body of {@code url}, following redirects.
 *
 * @param url address to fetch
 * @return the response body when the status code equals {@code OK}; an empty string when the
 *     status differs or the request fails with an {@link IOException}
 */
public static String extractContent(String url) {
  try {
    Connection request =
        Jsoup.connect(url).userAgent(USER_AGENT).followRedirects(true).timeout(GET_TIMEOUT);

    long startedAt = System.currentTimeMillis();
    Connection.Response response = request.execute();
    long elapsed = System.currentTimeMillis() - startedAt;

    int status = response.statusCode();
    if (status != OK) {
      Logger.error("%s returned %d", url, status);
      return "";
    }

    String body = response.body();
    Logger.info(
        "%s retrieved, content length %d, time %s sec.",
        url, body.length(), FormatUtil.millis2Seconds(elapsed));
    return body;
  } catch (IOException e) {
    Logger.error(e, "%s cannot be read.", url);
    return "";
  }
}
/**
 * Downloads the login captcha image, writes it to {@code EzraPoundUtil.CAPTCHA_DIR} and stores
 * the cookies returned with it in {@code captchaCookies}.
 *
 * <p>{@code captchaCookies} is cleared on every call. When the request fails the method calls
 * itself with {@code times + 1} until {@code times} exceeds {@code maxRecursiveTimes}.
 *
 * @param times number of the current attempt
 * @return {@code true} once a captcha response was received; {@code false} when the attempt
 *     limit was exceeded
 */
public boolean getCaptchaImgAndCookies(int times) {
  captchaCookies.clear();
  if (times > maxRecursiveTimes) {
    return false;
  }

  Connection con =
      JsoupUtil.getResourceCon(
          "https://www.zhihu.com/captcha.gif?r=" + System.currentTimeMillis() + "&type=login");
  Response rs;
  try {
    rs = con.execute();
  } catch (IOException e) {
    e.printStackTrace();
    log.info("获取验证码第" + times + "次失败");
    return getCaptchaImgAndCookies(times + 1);
  }

  File file = new File(EzraPoundUtil.CAPTCHA_DIR);
  // try-with-resources closes the stream even when the write fails; the original leaked it.
  try (FileOutputStream out = new FileOutputStream(file)) {
    out.write(rs.bodyAsBytes());
  } catch (IOException e) {
    e.printStackTrace();
  }

  captchaCookies.putAll(rs.cookies());
  log.info("验证码已保存" + ",路径为:" + file.getAbsolutePath());
  log.info("验证码对应cookie为:" + captchaCookies);
  return true;
}
public boolean crawl(String url) { // Give it a URL and it makes an HTTP try { Connection connection = Jsoup.connect(url).userAgent(USER_AGENT); Document htmlDocument = connection.get(); this.htmlDocument = htmlDocument; if (connection.response().statusCode() == 200) // 200 is the HTTP OK // status code { System.out.println("\n**Visiting** Received web page at " + url); } if (!connection.response().contentType().contains("text/html")) { System.out.println("**Failure** Retrieved something other that HTML"); return false; } Elements linksOnPage = htmlDocument.select("a[href]"); System.out.println("Found (" + linksOnPage.size() + "),links"); for (Element link : linksOnPage) { this.links.add(link.absUrl("href")); } return true; } catch (IOException ioe) { // We were not successful in out http request // System.out.println("Error in our HTTP request "+ioe); return false; } }
/** * Parse cn page and write in hbase * * @param symbol */ public static void parseCNSymbols(String symbol) { if (!Hbase.getData(symbol).equals("")) { // System.out.println(symbol + " Exists!"); return; } String url = "http://xueqiu.com/S/" + symbol + "/historical.csv"; Response rs = null; // System.out.println(url); try { Connection con = getConnection(url, "historyHttp"); con.header("Referer", " http://xueqiu.com/S/" + symbol); rs = con.execute(); // System.out.println(rs.body()); } catch (IOException e1) { if (handleError) { System.out.println(symbol + " http error"); errors.add(symbol); } else { WriteError(symbol); System.out.println(symbol + " http error"); } return; } try { BufferedReader reader = new BufferedReader(new StringReader(rs.body())); // 换成你的文件名 reader.readLine(); // 第一行信息,为标题信息,不用,如果需要,注释掉 String line = null; JSONArray HistoricalData = new JSONArray(); List<JSONArray> jsonLists = new ArrayList<JSONArray>(); while ((line = reader.readLine()) != null) { String item[] = line.split(","); // CSV格式文件为逗号分隔符文件,这里根据逗号切分 // System.out.println(item[0]); JSONArray DailyData = new JSONArray(); for (int i = 1; i < item.length; i++) { item[i] = item[i].replace("\"", ""); DailyData.put(item[i]); } if (Double.valueOf(DailyData.getString(2)) != 0) { jsonLists.add(DailyData); } } for (int i = (jsonLists.size() - 1); i >= 0; i--) { HistoricalData.put(jsonLists.get(i)); } Hbase.addData(symbol, type, HistoricalData.toString()); // System.out.println(symbol + " done"); // System.out.println(jsonLists); } catch (Exception e) { if (handleError) { System.out.println(symbol + " parsing error"); errors.add(symbol); } else { WriteError(symbol); System.out.println(symbol + " parsing error"); } } }
/**
 * Fetches {@code url} and converts every {@code tr} of the page's first {@code tbody} with the
 * given processor.
 *
 * @param url page containing the table
 * @param processor converter applied to each table row
 * @param <T> type produced for each row
 * @return the converted rows in document order
 * @throws Exception if the page cannot be fetched, has no {@code tbody}, or the processor fails
 */
private <T> List<T> processTable(String url, LineProcessor<T> processor) throws Exception {
  List<T> results = new ArrayList<>();

  Document page = DomUtils.get(url).get();
  Element firstBody = page.getElementsByTag("tbody").get(0);
  for (Element row : firstBody.getElementsByTag("tr")) {
    results.add(processor.processLine(row));
  }

  return results;
}
/**
 * Fetches a sample article, prints its raw body, then prints the {@code src} of every element
 * whose {@code src} attribute ends in {@code .gif}.
 */
@Test
public void test03() throws Exception {
  String html = Jsoup.connect("http://www.jb51.net/article/16829.htm").execute().body();
  System.out.println(html);

  for (Element gif : Jsoup.parse(html).select("[src$=.gif]")) {
    System.out.println(gif.attr("src"));
  }
}
/** * 点赞 * * @param statusId * @param like * @param cookie * @return * @throws TaskException */ public LikeResultBean doLike(String statusId, boolean like, String cookie) throws TaskException { try { String url = like ? "http://m.weibo.cn/attitudesDeal/add" : "http://m.weibo.cn/attitudesDeal/delete"; Map<String, String> cookieMap = new HashMap<String, String>(); String[] cookieValues = cookie.split(";"); for (String cookieValue : cookieValues) { String key = cookieValue.split("=")[0]; String value = cookieValue.split("=")[1]; cookieMap.put(key, value); } // Logger.d(WeiboClientActivity.TAG, cookieMap); Connection connection = Jsoup.connect(url); connection .userAgent( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:33.0) Gecko/20100101 Firefox/33.0") .referrer("http://m.weibo.cn/") .cookies(cookieMap) .data("id", statusId) .method(Connection.Method.POST); if (like) connection.data("attitude", "heart"); String body = connection.execute().body(); if (!TextUtils.isEmpty(body)) { Logger.d(TAG, body); if (body.indexOf("http://passport.weibo.cn/sso/crossdomain") != -1) throw new TaskException("-100", "未登录"); else if (body.indexOf("<html") != -1) throw new TaskException("-100", "未登录"); LikeResultBean likeBean = JSON.parseObject(body, LikeResultBean.class); if (likeBean.getOk() == 1) { return likeBean; } else if (likeBean.getOk() == -100) { throw new TaskException("-100", "未登录"); } else { throw new TaskException("", likeBean.getMsg()); } } } catch (Exception e) { if (e instanceof TaskException) throw (TaskException) e; e.printStackTrace(); } throw new TaskException(TaskException.TaskError.timeout.toString()); }
/**
 * Reloads the saved login cookies and checks whether they still yield a logged-in home page.
 *
 * @return the result of {@code checkLogin} on the fetched home page, or {@code false} when the
 *     request fails
 */
public boolean loginBySavedCookies() {
  loginCookies.clear();
  readCookies(EzraPoundUtil.LOGIN_COOKIES_DIR, loginCookies);

  Connection homeRequest = JsoupUtil.getGetCon("https://www.zhihu.com");
  try {
    Response homePage = homeRequest.cookies(loginCookies).execute();
    return checkLogin(Jsoup.parse(homePage.body()));
  } catch (IOException e) {
    e.printStackTrace();
    log.info("携带cookie登录测试失败");
    return false;
  }
}
/**
 * Executes an HTTP request with a 10 second timeout, any content type accepted and no limit on
 * the body size.
 *
 * @param url target address
 * @param method HTTP method to use
 * @param cookies cookies to send, or {@code null} for none
 * @param data form/query parameters to send, or {@code null} for none
 * @return the response, or {@code null} when the request fails with an {@link IOException}
 */
private static Response execute(
    String url, Method method, Map<String, String> cookies, Map<String, String> data) {
  Connection request =
      Jsoup.connect(url).method(method).timeout(10000).ignoreContentType(true).maxBodySize(0);

  if (cookies != null) {
    request.cookies(cookies);
  }
  if (data != null) {
    for (Entry<String, String> field : data.entrySet()) {
      request.data(field.getKey(), field.getValue());
    }
  }

  try {
    return request.execute();
  } catch (IOException e) {
    e.printStackTrace();
    return null;
  }
}
public void download(Connection aInConnection, Collection<Image> images) throws IOException { aInConnection.url(url); Document lDocument = aInConnection.get(); Element lMain = lDocument.getElementById("main"); Elements lContents = lMain.getElementsByClass("content"); if (lContents.size() == 1) { StringBuilder sb = new StringBuilder(); Element lContent = lContents.first(); collectImages(lContent, images); Elements lLightboxElements = lContent.getElementsByClass("lightbox"); for (Element lLightboxElement : lLightboxElements) { Collection<Node> lImageNodes = extractImageNodes(lLightboxElement); Element lParent = lLightboxElement.parent(); int i = lLightboxElement.siblingIndex(); lParent.insertChildren(i, lImageNodes); lLightboxElement.remove(); } Elements lChildElements = lContent.children(); for (Element lChildElement : lChildElements) { if (lChildElement.hasClass("clear")) { // no more post content break; } if (title == null && lChildElement.tagName().equals("h1")) { // the first h1 header is the title title = lChildElement.html(); } else { if (excerpt == null && lChildElement.tagName().equals("p")) { excerpt = lChildElement.text(); } String lStr = lChildElement.toString(); sb.append(lStr); } } content = sb.toString(); Elements lDateElements = lContent.getElementsByClass("date"); String lHunDate = lDateElements.first().html(); date = new PostDate(lHunDate); } else { System.out.println("More than one content in main section of post page " + toString()); } }
/**
 * Fetches the zhihu.com home page and stores the value of the sign-in form's {@code _xsrf}
 * field in {@code xsrf}.
 *
 * <p>When the request fails the method calls itself with {@code times + 1} until
 * {@code times} exceeds {@code maxRecursiveTimes}.
 *
 * @param times number of the current attempt
 * @return {@code true} once the page was fetched; {@code false} when the attempt limit was
 *     exceeded
 */
public boolean getXsrf(int times) {
  if (times > maxRecursiveTimes) {
    return false;
  }

  Response homePage;
  try {
    homePage = JsoupUtil.getGetCon("http://www.zhihu.com").execute();
  } catch (IOException e) {
    e.printStackTrace();
    log.info("获取_xsrf第" + times + "次失败");
    return getXsrf(times + 1);
  }

  xsrf = Jsoup.parse(homePage.body()).select(".view.view-signin [name=\"_xsrf\"]").attr("value");
  log.info("已获得xsrf:" + xsrf);
  return true;
}
protected Document getDocument() { if (_document != null) { return _document; } Connection connection = Jsoup.connect(_url); try { _document = connection.post(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return _document; }
/**
 * Fetches {@code url}, stores the parsed page in {@code htmlPage} and adds the absolute URL of
 * every {@code a[href]} element to {@code links}. A failed request is only reported on stdout.
 *
 * @param url page to visit
 */
public void crawl(String url) {
  try {
    Document fetched = Jsoup.connect(url).userAgent(USER_AGENT).get();
    this.htmlPage = fetched;
    System.out.println("WEB PAGE FOUND AT " + url);

    Elements anchors = fetched.select("a[href]");
    System.out.println("FOUND (" + anchors.size() + ") links");
    for (Element anchor : anchors) {
      this.links.add(anchor.absUrl("href"));
    }
  } catch (IOException e) {
    System.out.println("ERROR: " + e);
  }
}
@Override protected List<GiveawayGroup> doInBackground(Void... params) { Log.d(TAG, "Fetching giveaways for page " + page); try { // Fetch the Giveaway page Connection jsoup = Jsoup.connect("http://www.steamgifts.com/giveaway/" + path + "/groups/search") .userAgent(Constants.JSOUP_USER_AGENT) .timeout(Constants.JSOUP_TIMEOUT); jsoup.data("page", Integer.toString(page)); if (SteamGiftsUserData.getCurrent(fragment.getContext()).isLoggedIn()) jsoup.cookie( "PHPSESSID", SteamGiftsUserData.getCurrent(fragment.getContext()).getSessionId()); Document document = jsoup.get(); SteamGiftsUserData.extract(fragment.getContext(), document); // Parse all rows of groups Elements groups = document.select(".table__row-inner-wrap"); Log.d(TAG, "Found inner " + groups.size() + " elements"); List<GiveawayGroup> groupList = new ArrayList<>(); for (Element element : groups) { Element link = element.select(".table__column__heading").first(); // Basic information String title = link.text(); String id = link.attr("href").substring(7, 12); String avatar = null; Element avatarNode = element.select(".global__image-inner-wrap").first(); if (avatarNode != null) avatar = Utils.extractAvatar(avatarNode.attr("style")); GiveawayGroup group = new GiveawayGroup(id, title, avatar); groupList.add(group); } return groupList; } catch (IOException e) { Log.e(TAG, "Error fetching URL", e); return null; } }
public static Map<String, String> login( HttpServletRequest req, HttpServletResponse res, String username, String password) throws Exception { String url = ReadProperties.getByName("login.ip") + "/login"; Map<String, String> datas = new HashMap<String, String>(); Map<String, String> cookies = new HashMap<String, String>(); Connection con = Jsoup.connect(url).timeout(120000); // 获取连接 con.header( "User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20100101 Firefox/29.0"); // 配置模拟浏览器 Response rs; rs = con.execute(); cookies = rs.cookies(); Document doc = Jsoup.parse(rs.body()); // 转换为Dom树 List<Element> et = doc.select("form"); // 获取form表单,可以通过查看页面源码代码得知 for (Element e : et.get(0).getAllElements()) { if (e.attr("name").equals("username")) { e.attr("value", username); // 设置用户名 } if (e.attr("name").equals("password")) { e.attr("value", password); // 设置用户密码 } if (e.attr("name").length() > 0) { // 排除空值表单属性 datas.put(e.attr("name"), e.attr("value")); } } // 设置cookie和post上面的map数据 Response login = null; login = con.data(datas).cookies(cookies).method(Method.POST).execute(); url = ReadProperties.getByName("common.ip") + req.getContextPath() + "/user/getUser"; con = Jsoup.connect(url) .cookies(login.cookies()) .ignoreContentType(true) .method(Method.GET); // 获取连接 rs = con.execute(); for (Entry<String, String> entry : rs.cookies().entrySet()) { Cookie cookie = new Cookie(entry.getKey(), entry.getValue()); cookie.setPath(req.getContextPath() + "/"); res.addCookie(cookie); } return JsonUtil.jsonToObject(rs.body(), Map.class); }
/**
 * Logs in interactively: downloads a captcha, reads e-mail, password and captcha text from
 * standard input, posts them to the login endpoint and verifies the session by loading the home
 * page with the returned cookies.
 *
 * <p>On success the cookies of the login response are stored in {@code loginCookies} and
 * saved to {@code EzraPoundUtil.LOGIN_COOKIES_DIR}.
 *
 * @return {@code true} when {@code checkLogin} accepts the home page; {@code false} when the
 *     login request or the verification request fails, or the page is not logged in
 */
public boolean loginByEmailAndPwd() {
  loginCookies.clear();
  // Not closed on purpose: closing the Scanner would also close System.in.
  Scanner sc = new Scanner(System.in);
  getCaptchaImgAndCookies(0);
  log.info("请输入账号:");
  email = sc.nextLine();
  log.info("请输入密码");
  password = sc.nextLine();
  log.info("查看验证码并输入");
  captcha = sc.nextLine();

  Connection con = JsoupUtil.getPostCon("https://www.zhihu.com/login/email");
  Response rs;
  try {
    rs =
        con.data("_xsrf", xsrf)
            .data("email", email)
            .data("password", password)
            .data("remember_me", remeberMe)
            .data("captcha", captcha)
            .cookies(captchaCookies)
            .ignoreContentType(true)
            .execute();
  } catch (IOException e) {
    e.printStackTrace();
    log.info("通过账号密码登录发生异常");
    return false;
  }

  JSONObject jsonObject = new JSONObject(rs.body());
  log.info(EzraPoundUtil.unicode2Character(jsonObject.get("msg").toString()));

  Response rs2;
  try {
    rs2 = JsoupUtil.getGetCon("https://www.zhihu.com").cookies(rs.cookies()).execute();
  } catch (IOException e) {
    // Without the home page the login cannot be verified; previously execution continued and
    // dereferenced the null response.
    e.printStackTrace();
    return false;
  }

  if (checkLogin(Jsoup.parse(rs2.body()))) {
    loginCookies.putAll(rs.cookies());
    saveCookies(EzraPoundUtil.LOGIN_COOKIES_DIR, loginCookies);
    return true;
  }
  return false;
}
public static void getVOSA() throws IOException { Document doc; // need http protocol Connection connection = Jsoup.connect(VOSA_URL); connection.timeout(30000); doc = connection.get(); // get all links Elements links = doc.select("a.top_link"); for (Element link : links) { // get the value from href attribute System.out.println("\nlink : " + link.attr("href")); System.out.println("text : " + link.text()); } }
/**
 * Executes the given connection and merges the response cookies into {@code cookies}.
 *
 * <p>The request URL is printed before execution. In {@code Mode.TEST} nothing is sent.
 *
 * @param connection Jsoup connection object
 * @param method HTTP method to set on the connection, or {@code null} to leave it unchanged
 * @return the response, or {@code null} in test mode or when the request fails
 */
public Connection.Response execute(Connection connection, Connection.Method method) {
  if (method != null) {
    connection.method(method);
  }

  try {
    System.out.println("Calling " + connection.request().url());
    if (props.getMode() == Mode.TEST) {
      return null;
    }
    Connection.Response received = connection.execute();
    this.cookies.putAll(received.cookies());
    return received;
  } catch (IOException e) {
    e.printStackTrace();
    return null;
  }
}
/**
 * Runs a Google search for {@code term} restricted to {@code filetype} and returns the URLs
 * shown in the result page's {@code cite} elements.
 *
 * <p>Cite texts that do not start with {@code http} are prefixed with {@code http://}.
 *
 * @return the result URLs in page order
 * @throws Exception if the search page cannot be fetched or a cite text is not a valid URL
 */
public URL[] getURLs() throws Exception {
  String encodedQuery = URLEncoder.encode(term + " filetype:" + filetype, "UTF-8");
  URL searchUrl =
      new URL("http://www.google.com/search?q=" + encodedQuery + "&start=" + start);

  Connection request = HttpConnection.connect(searchUrl);
  request.timeout(60000);
  request.userAgent("");
  Document resultPage = request.get();

  List<URL> found = new ArrayList<>();
  for (Element cite : resultPage.select("cite")) {
    String address = cite.text();
    found.add(new URL(address.startsWith("http") ? address : "http://" + address));
  }
  return found.toArray(new URL[found.size()]);
}
public Document docGet_URL(String url) { Document doc; Connection conn; int timeout = CONNECT_TIMEOUT_INI; conn = null; try { conn = Jsoup.connect(url); } catch (Exception _) { ; // conn is null at this point } // end of [try] if (conn == null) return null; while (true) { conn = conn.timeout(timeout); try { return conn.get(); } catch (IOException _) { timeout = 2 * timeout; if (timeout > CONNECT_TIMEOUT_FIN) { _geterr(url); break; } } // end of [try] } return null; // HX: max timeout reached at this point }
/**
 * Parses the feed at {@code url}, downloads the page each feed item links to and converts the
 * items accepted by {@code findWords} into posts.
 *
 * <p>When {@code json} is {@code false} each accepted item becomes a {@code PostType} that is
 * added to {@code newsList}; the list is then passed through {@code completeLinks} and
 * {@code Feed2XML}, and the content of {@code sw} is returned. When {@code json} is
 * {@code true} each accepted item becomes a {@code Post} that is added to {@code newsListSolr}
 * and the result is {@code "{post: " + <Gson JSON of newsListSolr> + "}"}.
 *
 * <p>NOTE(review): {@code newsList}, {@code newsListSolr} and {@code sw} are not declared in
 * this method - presumably fields; confirm whether they are reset between calls.
 *
 * @param url feed address; it is URL-decoded as UTF-8 before use
 * @param json selects the output form described above
 * @param params request parameters; the keys read here are {@code "zone"}, {@code "place"}
 *     (presence only), {@code "searchlist"}, {@code "blacklist"}, {@code "latitud"},
 *     {@code "longitud"}, {@code "tagslist"} and {@code "comments"}
 * @return the serialized posts
 * @throws Exception if the feed or a linked page cannot be fetched or parsed
 */
public String getParse(String url, boolean json, HashMap<String, Object> params)
    throws Exception {
  url = URLDecoder.decode(url, "UTF-8");
  URL feedURL = new URL(url);
  Feed feed = FeedParser.parse(feedURL);
  PostType newEntry;
  Post newEntrySolr;
  Gson gson = new Gson();
  List<LinkType> links;
  Document doc;
  FeedSelectors feedSelectors;
  // The "zone" parameter is used to look up both the zone and, when requested, the place.
  String extendedString = (String) params.get("zone");
  Place place = null;
  if (params.containsKey("place")) {
    place = placeDao.retrieveByExtendedString(extendedString);
  }
  org.zonales.tagsAndZones.objects.Zone zone = zoneDao.retrieveByExtendedString(extendedString);
  if (!json) {
    // XML output: build PostType entries.
    for (int i = 0; i < feed.getItemCount(); i++) {
      FeedItem entry = feed.getItem(i);
      Logger.getLogger(this.getClass().getName())
          .log(
              Level.INFO,
              "Intentando conectar a {0}",
              new Object[] {entry.getLink().toString()});
      Connection conn = Jsoup.connect(entry.getLink().toString());
      conn.timeout(60000);
      doc = conn.get();
      // Host of the URL the response finally came from.
      String responseURL = conn.response().url().getHost();
      Logger.getLogger(this.getClass().getName())
          .log(Level.INFO, "Parseando la URL: {0}", new Object[] {entry.getLink().toString()});
      // NOTE(review): looked up again for every item although the argument does not change.
      feedSelectors = dao.retrieve(url);
      if (findWords(
          entry.getTitle(),
          doc,
          (ArrayList) params.get("searchlist"),
          (ArrayList) params.get("blacklist"),
          feedSelectors)) {
        newEntry = new PostType();
        // Source is the host of the feed header link, or of the feed URL when there is none.
        String source;
        if (feed.getHeader() == null || feed.getHeader().getLink() == null) {
          Logger.getLogger(this.getClass().getName()).log(Level.INFO, "NULL: Link");
          source = feedURL.getHost();
        } else {
          Logger.getLogger(this.getClass().getName())
              .log(Level.INFO, "NO NULL: {0}", feed.getHeader().getLink().toString());
          source = feed.getHeader().getLink().getHost();
        }
        newEntry.setSource(source);
        newEntry.setDocType("post");
        newEntry.setZone(
            new Zone(
                String.valueOf(zone.getId()),
                zone.getName(),
                zone.getType().getName(),
                zone.getExtendedString()));
        newEntry.setPostLatitude(Double.parseDouble((String) params.get("latitud")));
        newEntry.setPostLongitude(Double.parseDouble((String) params.get("longitud")));
        // Id preference: GUID, then the Atom "id" element, then the item title.
        newEntry.setId(
            entry.getGUID() != null
                ? entry.getGUID()
                : (entry.getElementValue("http://www.w3.org/2005/Atom", "id") != null
                    ? entry.getElementValue("http://www.w3.org/2005/Atom", "id")
                    : (entry.getTitle())));
        newEntry.setFromUser(
            new User(
                null,
                source,
                null,
                null,
                place != null
                    ? new org.zonales.entities.Place(
                        String.valueOf(place.getId()), place.getName(), place.getType().getName())
                    : null));
        newEntry.setTitle(entry.getTitle());
        newEntry.setText(entry.getDescriptionAsText());
        newEntry.setTags(new TagsType((ArrayList) params.get("tagslist")));
        if (newEntry.getLinks() == null) {
          newEntry.setLinks(new LinksType(new ArrayList<LinkType>()));
        }
        if ((links = getLinks(feedSelectors, doc, responseURL)) != null) {
          newEntry.getLinks().getLink().addAll(links);
        }
        newEntry.getLinks().getLink().add(new LinkType("source", entry.getLink().toString()));
        if (newEntry.getActions() == null) {
          newEntry.setActions(new ActionsType(new ArrayList<ActionType>()));
        }
        newEntry.setActions(
            new ActionsType(
                getActions(
                    feedSelectors,
                    doc,
                    newEntry.getId(),
                    json,
                    (Boolean) params.get("comments"),
                    source)));
        if (entry.getPubDate() != null) {
          newEntry.setCreated(String.valueOf(entry.getPubDate().getTime()));
        }
        if (entry.getModDate() != null) {
          newEntry.setModified(String.valueOf(entry.getModDate().getTime()));
        }
        // The relevance is the count of the "comments" action, when there is one.
        for (ActionType action : newEntry.getActions().getAction()) {
          if ("comments".equals(action.getType())) {
            newEntry.setRelevance(action.getCant());
          }
        }
        if (!json) {
          newEntry.setVerbatim(gson.toJson(newEntry));
        }
        newsList.add(newEntry);
      }
    }
    PostsType news;
    news = new PostsType(newsList);
    completeLinks(news);
    Feed2XML(news, sw);
    return sw.toString();
  } else {
    // JSON output: build Post entries.
    for (int i = 0; i < feed.getItemCount(); i++) {
      FeedItem entry = feed.getItem(i);
      Logger.getLogger(this.getClass().getName())
          .log(
              Level.INFO,
              "Intentando conectar a {0}",
              new Object[] {entry.getLink().toString()});
      Connection conn = Jsoup.connect(entry.getLink().toString());
      conn.timeout(60000);
      doc = conn.get();
      // Host of the URL the response finally came from.
      String responseURL = conn.response().url().getHost();
      Logger.getLogger(this.getClass().getName())
          .log(Level.INFO, "Parseando la URL: {0}", new Object[] {entry.getLink().toString()});
      feedSelectors = dao.retrieve(url);
      if (findWords(
          entry.getTitle(),
          doc,
          (ArrayList) params.get("searchlist"),
          (ArrayList) params.get("blacklist"),
          feedSelectors)) {
        newEntrySolr = new Post();
        // Source is the host of the feed header link, or of the feed URL when there is none.
        String source;
        if (feed.getHeader() == null || feed.getHeader().getLink() == null) {
          Logger.getLogger(this.getClass().getName()).log(Level.INFO, "NULL: Link");
          source = feedURL.getHost();
        } else {
          Logger.getLogger(this.getClass().getName())
              .log(Level.INFO, "NO NULL: {0}", feed.getHeader().getLink().toString());
          source = feed.getHeader().getLink().getHost();
        }
        newEntrySolr.setSource(source);
        newEntrySolr.setDocType("post");
        newEntrySolr.setZone(
            new Zone(
                String.valueOf(zone.getId()),
                zone.getName(),
                zone.getType().getName(),
                zone.getExtendedString()));
        newEntrySolr.setPostLatitude(Double.parseDouble((String) params.get("latitud")));
        newEntrySolr.setPostLongitude(Double.parseDouble((String) params.get("longitud")));
        // Id preference: GUID, then the Atom "id" element, then the item title.
        newEntrySolr.setId(
            entry.getGUID() != null
                ? entry.getGUID()
                : (entry.getElementValue("http://www.w3.org/2005/Atom", "id") != null
                    ? entry.getElementValue("http://www.w3.org/2005/Atom", "id")
                    : (entry.getTitle())));
        newEntrySolr.setFromUser(
            new User(
                null,
                source,
                null,
                null,
                place != null
                    ? new org.zonales.entities.Place(
                        String.valueOf(place.getId()), place.getName(), place.getType().getName())
                    : null));
        newEntrySolr.setTitle(entry.getTitle());
        newEntrySolr.setText(entry.getDescriptionAsText());
        newEntrySolr.setTags(new ArrayList<String>((ArrayList) params.get("tagslist")));
        if (newEntrySolr.getLinks() == null) {
          newEntrySolr.setLinks(new ArrayList<LinkType>());
        }
        if ((links = getLinks(feedSelectors, doc, responseURL)) != null) {
          newEntrySolr.getLinks().addAll(links);
        }
        newEntrySolr.getLinks().add(new LinkType("source", entry.getLink().toString()));
        if (newEntrySolr.getActions() == null) {
          newEntrySolr.setActions(new ArrayList<ActionType>());
        }
        newEntrySolr
            .getActions()
            .addAll(
                getActions(
                    feedSelectors,
                    doc,
                    newEntrySolr.getId(),
                    json,
                    (Boolean) params.get("comments"),
                    source));
        if (entry.getPubDate() != null) {
          newEntrySolr.setCreated((entry.getPubDate().getTime()));
        }
        if (entry.getModDate() != null) {
          newEntrySolr.setModified((entry.getModDate().getTime()));
        }
        // The relevance is the count of the "comments" action, when there is one.
        for (ActionType action : newEntrySolr.getActions()) {
          if ("comments".equals(action.getType())) {
            newEntrySolr.setRelevance(action.getCant());
          }
        }
        // json is true in this branch, so this condition never holds here.
        if (!json) {
          newEntrySolr.setVerbatim(gson.toJson(newEntrySolr));
        }
        newsListSolr.add(newEntrySolr);
      }
    }
    return "{post: " + gson.toJson(newsListSolr) + "}";
  }
}
/**
 * Adds the giveaway id to the request as the {@code code} parameter.
 *
 * @param connection request being prepared
 */
@Override
public void addExtraParameters(Connection connection) {
  final String codeParameter = "code";
  connection.data(codeParameter, giveawayId);
}
/**
 * Fills in the details of a major from its course page and, for most courses, from the
 * handbook page linked there.
 *
 * <p>From the course page: length and month of entry (basic table), application link (first
 * bordered button, stored as the application fee), academic requirements, school (last link in
 * the course row, with known faculty URLs mapped to display names) and tuition fee. Unless the
 * application link is one of two excluded handbook pages, that page is fetched and the course
 * structure is taken from it. Finally the major is passed to {@code mark}.
 *
 * @param major major to complete; its URL must already be set
 * @throws Exception if a page cannot be fetched or parsed
 */
public static void getDetails(MajorForCollection major) throws Exception {
  Document doc = Jsoup.connect(major.getUrl()).timeout(10000).followRedirects(true).get();
  Element e;

  Elements basicTables = doc.select("table.course-page__table-basic");
  if (basicTables.size() > 0) {
    e = basicTables.get(0);
    for (Element tr : e.select("tr")) {
      String rowText = tr.text();
      // The helpers receive the text of the whole table, not just the matching row.
      if (rowText.contains("Duration")) {
        major.setLength(getLength(e.text()));
      } else if (tr.text().contains("Start date")) {
        major.setMonthOfEntry(getMonthOfEntry(e.text()));
      }
    }
  }

  Elements applyButtons = doc.select("a.btn.btn-bordered");
  if (applyButtons.size() > 0) {
    major.setApplicationFee(applyButtons.get(0).attr("href"));
  }

  Elements requirements = doc.select("#entry-requirements-2");
  if (requirements.size() > 0) {
    major.setAcademicRequirements(requirements.get(0).text());
  }

  Elements courseLinks = doc.select("div.course-page.row a");
  if (courseLinks.size() > 0) {
    major.setSchool(courseLinks.last().attr("href"));
    // Translate known faculty links into display names; unknown links stay as they are.
    switch (major.getSchool()) {
      case "http://www.study.monash/media/links/faculty-websites/design-and-architecture":
        major.setSchool("Monash Art Design & Architecture");
        break;
      case "http://www.study.monash/media/links/faculty-websites/business-and-economics":
        major.setSchool("Monash Business School");
        break;
      case "http://www.study.monash/media/links/faculty-websites/arts":
        major.setSchool("Faculty of Arts, Monash University");
        break;
      case "http://www.study.monash/media/links/faculty-websites/science":
        major.setSchool("Faculty of Science");
        break;
      case "http://www.study.monash/media/links/faculty-websites/medicine":
        major.setSchool("Faculty of Medicine, Nursing and Health Sciences");
        break;
      case "http://www.study.monash/media/links/faculty-websites/education":
        major.setSchool("Faculty of Education - Faculty of Education");
        break;
      case "http://www.study.monash/media/links/faculty-websites/engineering":
        major.setSchool("Faculty of Engineering, Monash University");
        break;
      case "http://www.study.monash/media/links/faculty-websites/information-technology":
        major.setSchool("Faculty of Information Technology - Monash University");
        break;
      case "http://www.study.monash/media/links/faculty-websites/pharmacy":
        major.setSchool("Faculty of Pharmacy and Pharmaceutical Sciences");
        break;
      case "http://www.study.monash/media/links/faculty-websites/law":
        major.setSchool("Faculty of Law");
        break;
      default:
        break;
    }
  }

  Elements fees = doc.select("#fees");
  if (fees.size() > 0) {
    major.setTuitionFee(fees.get(0).nextElementSibling().text());
  }

  boolean excludedHandbookPage =
      major
              .getApplicationFee()
              .equals("http://www.monash.edu.au/pubs/handbooks/courses/A6015.html")
          || major
              .getApplicationFee()
              .equals("http://www.monash.edu.au/pubs/handbooks/courses/2276.html");
  if (!excludedHandbookPage) {
    doc = WebUtils.getDocument(major.getApplicationFee(), WebUtils.METHOD_GET, 10 * 1000);

    Elements requirementBodies =
        doc.select("h2.black.pub_heading:containsOwn(Requirements) + div.pub_body_text");
    if (requirementBodies.size() > 0) {
      e = requirementBodies.get(0);
      major.setStructure(replaceSpecialCharacter(html2Str(e.outerHtml())).trim());

      if (major.getStructure().contains("Part A.")) {
        // Keep everything from "Part A." onwards.
        major.setStructure(
            major.getStructure().substring(major.getStructure().indexOf("Part A.")));
      } else {
        Elements structureBodies =
            doc.select("h2.black.pub_heading:containsOwn(Structure) + div.pub_body_text");
        if (structureBodies.size() > 0) {
          e = structureBodies.get(0);
          major.setStructure(replaceSpecialCharacter(html2Str(e.text())).trim());
        }
      }
    }
  }

  mark(major, true);
}
/**
 * Handles the plugin search command: queries the Bukget API for the plugin named in
 * {@code args[0]} and posts name, description and URL of the best match to the channel.
 *
 * <p>An optional second argument is passed to the API as the {@code size} parameter. The best
 * match is the first result whose name equals the query ignoring case, otherwise the first
 * result. Without arguments the sender receives a usage notice.
 *
 * @param event message that triggered the command
 * @param args command arguments
 */
@Override
public void execute(MessageEvent event, String[] args) {
  User sender = event.getUser();
  Channel channel = event.getChannel();

  if (args.length == 0) {
    foxbot.sendNotice(
        sender,
        String.format(
            "Wrong number of args! Use %sbukkitsearch <plugin>",
            foxbot.getConfig().getCommandPrefix()));
    return;
  }

  String plugin = args[0].toLowerCase();
  String sizeSuffix = (args.length) == 1 ? "" : ("?size=" + args[1]);
  String url =
      String.format("http://api.bukget.org/3/search/plugin_name/like/%s%s", plugin, sizeSuffix);
  Connection conn =
      Jsoup.connect(url).timeout(500).followRedirects(true).ignoreContentType(true);

  String json;
  try {
    json = conn.get().text();
  } catch (IOException ex) {
    foxbot.log(ex);
    channel
        .send()
        .message(
            Utils.colourise(
                String.format(
                    "(%s) &cAn error occurred while querying the Bukget API!",
                    Utils.munge(sender.getNick()))));
    return;
  }

  if (json.equals("[]")) {
    channel
        .send()
        .message(
            Utils.colourise(
                String.format("(%s) &cNo results found!", Utils.munge(sender.getNick()))));
    return;
  }

  JSONArray results = new JSONArray(json);
  JSONObject found = null;
  for (int idx = 0; idx < results.length() && found == null; idx++) {
    JSONObject candidate = results.getJSONObject(idx);
    if (candidate.getString("plugin_name").equalsIgnoreCase(plugin)) {
      found = candidate;
    }
  }
  if (found == null) {
    // No exact match: fall back to the first result.
    found = results.getJSONObject(0);
  }

  String name = found.getString("plugin_name");
  String description = found.getString("description");
  String pluginUrl =
      String.format("http://dev.bukkit.org/bukkit-plugins/%s/", found.getString("slug"));
  if (description.isEmpty()) {
    description = "No description";
  }

  channel
      .send()
      .message(
          Utils.colourise(
              String.format(
                  "(%s) &2Name:&r %s &2Description:&r %s &2URL:&r %s",
                  Utils.munge(sender.getNick()), name, description, pluginUrl)));
}
@Override protected String doProcess(File htmlfile, String originalUrl, Intent intent) { try { // String charset = "utf-8"; Connection coon = HttpConnection.connect(originalUrl); coon.followRedirects( false); // we don't want it be redirected to other page,example: 10.254.7.4 Document doc = coon.get(); Element head = doc.head(); Element body = doc.body(); if (body.children().size() == 0) { Log.e(TAG, "body has no child with url=" + originalUrl); return PROCESS_FAILED_URL; } /* Elements meta = head.select("meta"); if(!meta.isEmpty()){ Element m = meta.get(0); String content = m.attr("content"); String attr = content.substring(content.indexOf("charset=")+8); if(!attr.trim().isEmpty()){ charset = attr; } } */ Elements base = head.select("base"); if (base.isEmpty()) { String b = head.baseUri(); Attributes attrs = new Attributes(); attrs.put("href", b); ArrayList<Element> a = new ArrayList<>(); a.add(new Element(Tag.valueOf("base"), b, attrs)); head.insertChildren(0, a); } Element div = doc.select("div.content-main").first(); if (div == null) { Log.e(TAG, "not found specific element with url=" + originalUrl); return PROCESS_FAILED_URL; } Element title = div.select("h1.title").first(); title.remove(); body.empty(); ArrayList<Element> a = new ArrayList<>(); a.add(div); body.insertChildren(0, a); int g = 0; while (g < 2) { // try two times. if (FileUtil.saveStringToFile(doc.toString(), htmlfile, false)) { break; } g++; } if (g < 2) return StringUtils.file2Url(htmlfile, PROCESS_FAILED_URL); Log.e(TAG, "save html to file failed with url=" + originalUrl); } catch (MalformedURLException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return PROCESS_FAILED_URL; }