/**
 * Sends one HTTP request through Jsoup and hands back the raw response.
 *
 * <p>The request is configured with a 10 second timeout, accepts any content type, and has its
 * body size limit set to 0 (Jsoup documents 0 as "no limit").
 *
 * @param url address to request
 * @param method HTTP method to use
 * @param cookies cookies to attach, or {@code null} to send none
 * @param data request parameters to attach, or {@code null} to send none
 * @return the response, or {@code null} when the request fails with an {@link IOException} (the
 *     stack trace is printed and the error is otherwise swallowed)
 */
private static Response execute(
    String url, Method method, Map<String, String> cookies, Map<String, String> data) {
  Connection request =
      Jsoup.connect(url)
          .method(method)
          .timeout(10000)
          .ignoreContentType(true)
          .maxBodySize(0);

  if (cookies != null) {
    request.cookies(cookies);
  }
  if (data != null) {
    for (Entry<String, String> parameter : data.entrySet()) {
      request.data(parameter.getKey(), parameter.getValue());
    }
  }

  try {
    return request.execute();
  } catch (IOException e) {
    // Best effort: callers receive null instead of an exception.
    e.printStackTrace();
    return null;
  }
}
/**
 * Reads the course type from the Unistats widget referenced by a course page.
 *
 * <p>The widget URL is assembled from the {@code data-institution}, {@code data-course}, {@code
 * data-orientation}, {@code data-language} and {@code data-kismode} attributes of the first
 * {@code #kw} element in {@code doc}. The widget page is fetched once (10 second timeout) and the
 * type is taken from its {@code #kisWidget > div.widgetCourse > h1} heading.
 *
 * @param doc page that may contain the {@code #kw} widget placeholder
 * @return the heading text up to its first space, or the whole trimmed heading when it contains
 *     no space; an empty string when {@code doc} has no {@code #kw} element, the widget has no
 *     heading, or fetching the widget throws (the stack trace is printed)
 */
public static String getType(Document doc) {
  Element placeholder = doc.select("#kw").first();
  if (placeholder == null) {
    return "";
  }

  String widgetUrl =
      "http://widget.unistats.ac.uk/Widget/"
          + placeholder.attr("data-institution")
          + "/"
          + placeholder.attr("data-course")
          + "/"
          + placeholder.attr("data-orientation")
          + "/"
          + "null/"
          + placeholder.attr("data-language")
          + "/"
          + placeholder.attr("data-kismode");

  String type = "";
  try {
    Document widget = Jsoup.connect(widgetUrl).timeout(10000).get();
    Element heading = widget.select("#kisWidget > div.widgetCourse > h1").first();
    if (heading != null) {
      String title = heading.text().trim();
      int firstSpace = title.indexOf(" ");
      type = firstSpace > 0 ? title.substring(0, firstSpace) : title;
    }
  } catch (Exception ex) {
    ex.printStackTrace();
  }
  return type;
}
public static void initMajorList(String originalUrl) { System.out.println("preparing majorList"); boolean finish = false; do { try { majorList.clear(); Connection conn = Jsoup.connect(originalUrl); Document doc = conn.timeout(10000).get(); Elements es = doc.select("#accordion__target-3 > div.course-listing__box > a"); for (Element e : es) { // major MajorForCollection major = new MajorForCollection(); major.setLevel(LEVEL); major.setTitle(e.select("h3").get(0).text().trim()); major.setType(e.select("p").get(0).text().replaceAll("-[\\s\\S]*", "").trim()); major.setUrl(e.select("a").get(0).attr("href")); majorList.add(major); } ; finish = true; } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } while (!finish); System.out.println("majorList prepared"); System.out.println("majorList size: " + majorList.size()); }
/**
 * Fetches {@code url} and extracts the fee from the cell {@code #block-system-main > table >
 * tbody > tr:nth-child(2) > td:nth-child(4)}.
 *
 * <p>The page is requested with a 5 second timeout. A failed request is retried, but only a
 * bounded number of times: Jsoup reports non-2xx responses as an {@link IOException} subtype, so
 * retrying without a limit would never terminate for a dead link.
 *
 * @param url page expected to contain the fee table
 * @return the result of {@code getFee} applied to the cell text; or {@code url} itself when the
 *     cell is not present on the page or the page could not be fetched within the allowed
 *     attempts
 */
public static String requestFee(String url) {
  final int maxAttempts = 5;
  final String feeCell = "#block-system-main > table > tbody > tr:nth-child(2) > td:nth-child(4)";

  for (int attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      Document doc = Jsoup.connect(url).timeout(5000).get();
      if (doc.select(feeCell).size() > 0) {
        return getFee(doc.select(feeCell).text());
      }
      // Page loaded but has no fee cell: hand back the URL as the "unknown fee" marker.
      return url;
    } catch (IOException e) {
      System.out.println("requestFee : " + e.getMessage());
    }
  }
  // Every attempt failed; use the same "unknown fee" marker as for a page without the cell.
  return url;
}
/**
 * Downloads {@code url} and returns the response body as text.
 *
 * <p>The request uses {@code USER_AGENT}, follows redirects and times out after {@code
 * GET_TIMEOUT}. Successful downloads are logged with the body length and the elapsed time.
 *
 * @param url address to download
 * @return the response body when the status code equals {@code OK}; an empty string when the
 *     status differs or the request fails with an {@link IOException} (both cases are logged as
 *     errors)
 */
public static String extractContent(String url) {
  try {
    Connection request =
        Jsoup.connect(url).userAgent(USER_AGENT).followRedirects(true).timeout(GET_TIMEOUT);

    long startedAt = System.currentTimeMillis();
    Connection.Response response = request.execute();
    long elapsedMillis = System.currentTimeMillis() - startedAt;

    int status = response.statusCode();
    if (status != OK) {
      Logger.error("%s returned %d", url, status);
      return "";
    }

    String body = response.body();
    Logger.info(
        "%s retrieved, content length %d, time %s sec.",
        url, body.length(), FormatUtil.millis2Seconds(elapsedMillis));
    return body;
  } catch (IOException e) {
    Logger.error(e, "%s cannot be read.", url);
    return "";
  }
}
public static void getVOSA() throws IOException { Document doc; // need http protocol Connection connection = Jsoup.connect(VOSA_URL); connection.timeout(30000); doc = connection.get(); // get all links Elements links = doc.select("a.top_link"); for (Element link : links) { // get the value from href attribute System.out.println("\nlink : " + link.attr("href")); System.out.println("text : " + link.text()); } }
/**
 * Runs one Google search for {@code term + " filetype:" + filetype}, starting at result offset
 * {@code start}, and collects the addresses shown in the {@code cite} elements of the result
 * page.
 *
 * <p>{@code term}, {@code filetype} and {@code start} are not declared in this method; presumably
 * they are fields of the enclosing class. The request uses a 60 second timeout and an empty
 * user-agent string. Cited addresses that do not start with {@code "http"} are prefixed with
 * {@code "http://"}.
 *
 * @return the cited addresses in page order
 * @throws Exception if the search URL or a cited address is malformed, or the request fails
 */
public URL[] getURLs() throws Exception {
  String query = term + " filetype:" + filetype;
  URL searchUrl =
      new URL(
          "http://www.google.com/search?q=" + URLEncoder.encode(query, "UTF-8") + "&start=" + start);

  Connection search = HttpConnection.connect(searchUrl);
  search.timeout(60000);
  search.userAgent("");
  Document results = search.get();

  List<URL> found = new ArrayList<>();
  for (Element cite : results.select("cite")) {
    String address = cite.text();
    found.add(new URL(address.startsWith("http") ? address : "http://" + address));
  }
  return found.toArray(new URL[found.size()]);
}
public Document docGet_URL(String url) { Document doc; Connection conn; int timeout = CONNECT_TIMEOUT_INI; conn = null; try { conn = Jsoup.connect(url); } catch (Exception _) { ; // conn is null at this point } // end of [try] if (conn == null) return null; while (true) { conn = conn.timeout(timeout); try { return conn.get(); } catch (IOException _) { timeout = 2 * timeout; if (timeout > CONNECT_TIMEOUT_FIN) { _geterr(url); break; } } // end of [try] } return null; // HX: max timeout reached at this point }
/**
 * Completes {@code major} with data scraped from its course page and, for most courses, from the
 * page its first {@code a.btn.btn-bordered} link points to.
 *
 * <p>From the course page (10 second timeout, redirects followed):
 *
 * <ul>
 *   <li>length and month of entry, when the {@code table.course-page__table-basic} table has a
 *       row containing "Duration" / "Start date";
 *   <li>the {@code href} of the first {@code a.btn.btn-bordered} link, stored through {@code
 *       setApplicationFee};
 *   <li>academic requirements from {@code #entry-requirements-2};
 *   <li>the school, taken from the {@code href} of the last {@code div.course-page.row a} link
 *       and replaced by a display name when it is one of the known faculty links;
 *   <li>the tuition fee from the element following {@code #fees}.
 * </ul>
 *
 * <p>Unless the stored link is one of two excluded handbook pages, that page is fetched through
 * {@code WebUtils.getDocument} and the course structure is read from it. Finally {@code
 * mark(major, true)} is called.
 *
 * <p>NOTE(review): {@code getApplicationFee()} is dereferenced unconditionally; if the course
 * page has no {@code a.btn.btn-bordered} link and the field is still null this throws a {@link
 * NullPointerException} - confirm whether that is intended.
 *
 * @param major entry to complete; its URL is read and its detail fields are written
 * @throws Exception if a page cannot be fetched or an expected element is missing
 */
public static void getDetails(MajorForCollection major) throws Exception {
  Document coursePage = Jsoup.connect(major.getUrl()).timeout(10000).followRedirects(true).get();

  Element basics = coursePage.select("table.course-page__table-basic").first();
  if (basics != null) {
    for (Element row : basics.select("tr")) {
      String rowText = row.text();
      // NOTE(review): the whole table's text is parsed, not just the matching row's - presumably
      // getLength/getMonthOfEntry locate their own label; confirm.
      if (rowText.contains("Duration")) {
        major.setLength(getLength(basics.text()));
      } else if (rowText.contains("Start date")) {
        major.setMonthOfEntry(getMonthOfEntry(basics.text()));
      }
    }
  }

  Element borderedLink = coursePage.select("a.btn.btn-bordered").first();
  if (borderedLink != null) {
    major.setApplicationFee(borderedLink.attr("href"));
  }

  Element entryRequirements = coursePage.select("#entry-requirements-2").first();
  if (entryRequirements != null) {
    major.setAcademicRequirements(entryRequirements.text());
  }

  Element facultyLink = coursePage.select("div.course-page.row a").last();
  if (facultyLink != null) {
    major.setSchool(facultyLink.attr("href"));
    String facultyName = monashFacultyName(major.getSchool());
    if (facultyName != null) {
      major.setSchool(facultyName);
    }
  }

  Element feesAnchor = coursePage.select("#fees").first();
  if (feesAnchor != null) {
    major.setTuitionFee(feesAnchor.nextElementSibling().text());
  }

  String handbookUrl = major.getApplicationFee();
  boolean excluded =
      handbookUrl.equals("http://www.monash.edu.au/pubs/handbooks/courses/A6015.html")
          || handbookUrl.equals("http://www.monash.edu.au/pubs/handbooks/courses/2276.html");
  if (!excluded) {
    Document handbook = WebUtils.getDocument(handbookUrl, WebUtils.METHOD_GET, 10 * 1000);
    Element requirementsBody =
        handbook
            .select("h2.black.pub_heading:containsOwn(Requirements) + div.pub_body_text")
            .first();
    if (requirementsBody != null) {
      major.setStructure(replaceSpecialCharacter(html2Str(requirementsBody.outerHtml())).trim());
      String structure = major.getStructure();
      if (structure.contains("Part A.")) {
        // Keep only the text from "Part A." onwards.
        major.setStructure(structure.substring(structure.indexOf("Part A.")));
      } else {
        Element structureBody =
            handbook
                .select("h2.black.pub_heading:containsOwn(Structure) + div.pub_body_text")
                .first();
        if (structureBody != null) {
          major.setStructure(replaceSpecialCharacter(html2Str(structureBody.text())).trim());
        }
      }
    }
  }

  mark(major, true);
}

/** Maps a known faculty-website link to its display name; returns null for any other link. */
private static String monashFacultyName(String facultyLink) {
  switch (facultyLink) {
    case "http://www.study.monash/media/links/faculty-websites/design-and-architecture":
      return "Monash Art Design & Architecture";
    case "http://www.study.monash/media/links/faculty-websites/business-and-economics":
      return "Monash Business School";
    case "http://www.study.monash/media/links/faculty-websites/arts":
      return "Faculty of Arts, Monash University";
    case "http://www.study.monash/media/links/faculty-websites/science":
      return "Faculty of Science";
    case "http://www.study.monash/media/links/faculty-websites/medicine":
      return "Faculty of Medicine, Nursing and Health Sciences";
    case "http://www.study.monash/media/links/faculty-websites/education":
      return "Faculty of Education - Faculty of Education";
    case "http://www.study.monash/media/links/faculty-websites/engineering":
      return "Faculty of Engineering, Monash University";
    case "http://www.study.monash/media/links/faculty-websites/information-technology":
      return "Faculty of Information Technology - Monash University";
    case "http://www.study.monash/media/links/faculty-websites/pharmacy":
      return "Faculty of Pharmacy and Pharmaceutical Sciences";
    case "http://www.study.monash/media/links/faculty-websites/law":
      return "Faculty of Law";
    default:
      return null;
  }
}
/**
 * Downloads the feed at {@code url}, fetches the page linked from every feed item, and converts
 * the items accepted by {@code findWords} into posts, returned either as XML or as JSON.
 *
 * <p>What the method does, in order:
 *
 * <ol>
 *   <li>{@code url} is URL-decoded as UTF-8 and parsed with {@code FeedParser.parse}; the decoded
 *       value is also the key later passed to {@code dao.retrieve}.
 *   <li>The zone is loaded with {@code zoneDao.retrieveByExtendedString} using the {@code "zone"}
 *       parameter. A place is loaded only when {@code params} contains a {@code "place"} key.
 *       NOTE(review): the place lookup is keyed by the {@code "zone"} value as well, not by the
 *       {@code "place"} value - confirm that this is intended.
 *   <li>For every feed item the linked page is fetched with Jsoup (60 second timeout), the feed
 *       selectors are loaded with {@code dao.retrieve(url)}, and the item is kept only when
 *       {@code findWords} accepts its title and page against the {@code "searchlist"} and {@code
 *       "blacklist"} parameters.
 *   <li>For a kept item: the source is the host of the feed header link, or the host of the feed
 *       URL when the header or its link is null; the id is the item GUID, else its Atom {@code
 *       id} element, else its title; links come from {@code getLinks} plus a {@code "source"}
 *       link to the item; actions come from {@code getActions}; the relevance is the count of the
 *       {@code "comments"} action when one is present.
 * </ol>
 *
 * <p>When {@code json} is false the items become {@code PostType} objects added to {@code
 * newsList}, which is wrapped in a {@code PostsType}, passed through {@code completeLinks} and
 * {@code Feed2XML(news, sw)}, and returned as {@code sw.toString()}. When {@code json} is true
 * the items become {@code Post} objects added to {@code newsListSolr}, and the result is the
 * Gson serialization of that list wrapped as a single "post" member.
 *
 * <p>NOTE(review): {@code newsList}, {@code newsListSolr} and {@code sw} are not visibly declared
 * in this method (the only {@code newsList} declaration appears to be commented out), so they are
 * presumably fields of the enclosing class; if so, results accumulate across calls - TODO
 * confirm.
 *
 * <p>NOTE(review): the {@code if (!json)} guard around {@code setVerbatim} is always true in the
 * XML branch and can never be true in the JSON branch, so {@code Post} entries never receive a
 * verbatim value.
 *
 * @param url URL-encoded address of the feed
 * @param json {@code true} to build {@code Post} objects and return JSON, {@code false} to build
 *     {@code PostType} objects and return XML
 * @param params request parameters; the keys read here are {@code "zone"} (String), {@code
 *     "place"} (presence only), {@code "searchlist"}, {@code "blacklist"} and {@code "tagslist"}
 *     (cast to ArrayList), {@code "latitud"} and {@code "longitud"} (String, parsed with {@code
 *     Double.parseDouble}) and {@code "comments"} (Boolean)
 * @return the XML or JSON rendering of the accepted posts
 * @throws Exception if the URL cannot be decoded or parsed, the feed or a linked page cannot be
 *     read, or any helper fails
 */
public String getParse(String url, boolean json, HashMap<String, Object> params) throws Exception { url = URLDecoder.decode(url, "UTF-8"); URL feedURL = new URL(url); // Logger.getLogger(this.getClass().getName()).log(Level.INFO, "Encoding del Feed: {0}", new // Object[]{feedURL.openConnection().getContentEncoding()}); Feed feed = FeedParser.parse(feedURL); // List<PostType> newsList = new ArrayList<PostType>(); PostType newEntry; // PostType newEntryComments; Post newEntrySolr; // SyndFeed feed = null; Gson gson = new Gson(); List<LinkType> links; Document doc; FeedSelectors feedSelectors; String extendedString = (String) params.get("zone"); Place place = null; if (params.containsKey("place")) { place = placeDao.retrieveByExtendedString(extendedString); } org.zonales.tagsAndZones.objects.Zone zone = zoneDao.retrieveByExtendedString(extendedString); if (!json) { for (int i = 0; i < feed.getItemCount(); i++) { FeedItem entry = feed.getItem(i); Logger.getLogger(this.getClass().getName()) .log( Level.INFO, "Intentando conectar a {0}", new Object[] {entry.getLink().toString()}); Connection conn = Jsoup.connect(entry.getLink().toString()); conn.timeout(60000); doc = conn.get(); String responseURL = conn.response().url().getHost(); // doc = Jsoup.connect(entry.getLink().toString()).timeout(60000).get(); Logger.getLogger(this.getClass().getName()) .log(Level.INFO, "Parseando la URL: {0}", new Object[] {entry.getLink().toString()}); feedSelectors = dao.retrieve(url); if (findWords( entry.getTitle(), doc, (ArrayList) params.get("searchlist"), (ArrayList) params.get("blacklist"), 
feedSelectors)) { newEntry = new PostType(); String source; if (feed.getHeader() == null || feed.getHeader().getLink() == null) { Logger.getLogger(this.getClass().getName()).log(Level.INFO, "NULL: Link"); source = feedURL.getHost(); } else { Logger.getLogger(this.getClass().getName()) .log(Level.INFO, "NO NULL: {0}", feed.getHeader().getLink().toString()); source = feed.getHeader().getLink().getHost(); // if (source.indexOf("/") != -1) { // source = source.substring(0, source.indexOf("/") + 1); // } } newEntry.setSource(source); newEntry.setDocType("post"); newEntry.setZone( new Zone( String.valueOf(zone.getId()), zone.getName(), zone.getType().getName(), zone.getExtendedString())); newEntry.setPostLatitude(Double.parseDouble((String) params.get("latitud"))); newEntry.setPostLongitude(Double.parseDouble((String) params.get("longitud"))); // newEntry.setId(entry.getUri()); // newEntry.setId(entry.getUri() != null && entry.getUri().length() > 0 ? // entry.getUri().trim() : entry.getLink().trim()+entry.getTitle().trim()); newEntry.setId( entry.getGUID() != null ? entry.getGUID() : (entry.getElementValue("http://www.w3.org/2005/Atom", "id") != null ? entry.getElementValue("http://www.w3.org/2005/Atom", "id") : (entry.getTitle()))); newEntry.setFromUser( new User( null, source, null, null, place != null ? 
new org.zonales.entities.Place( String.valueOf(place.getId()), place.getName(), place.getType().getName()) : null)); newEntry.setTitle(entry.getTitle()); newEntry.setText(entry.getDescriptionAsText()); newEntry.setTags(new TagsType((ArrayList) params.get("tagslist"))); if (newEntry.getLinks() == null) { newEntry.setLinks(new LinksType(new ArrayList<LinkType>())); } if ((links = getLinks(feedSelectors, doc, responseURL)) != null) { newEntry.getLinks().getLink().addAll(links); } newEntry.getLinks().getLink().add(new LinkType("source", entry.getLink().toString())); if (newEntry.getActions() == null) { newEntry.setActions(new ActionsType(new ArrayList<ActionType>())); } newEntry.setActions( new ActionsType( getActions( feedSelectors, doc, newEntry.getId(), json, (Boolean) params.get("comments"), source))); if (entry.getPubDate() != null) { newEntry.setCreated(String.valueOf(entry.getPubDate().getTime())); } if (entry.getModDate() != null) { newEntry.setModified(String.valueOf(entry.getModDate().getTime())); } for (ActionType action : newEntry.getActions().getAction()) { if ("comments".equals(action.getType())) { newEntry.setRelevance(action.getCant()); } } if (!json) { newEntry.setVerbatim(gson.toJson(newEntry)); } newsList.add(newEntry); // addToMap(parseResult, feed, feedLink, entry, content, newEntry); } } PostsType news; news = new PostsType(newsList); completeLinks(news); Feed2XML(news, sw); return sw.toString(); // + comments.toString(); } else { for (int i = 0; i < feed.getItemCount(); i++) { FeedItem entry = feed.getItem(i); Logger.getLogger(this.getClass().getName()) .log( Level.INFO, "Intentando conectar a {0}", new Object[] {entry.getLink().toString()}); Connection conn = Jsoup.connect(entry.getLink().toString()); conn.timeout(60000); doc = conn.get(); String responseURL = conn.response().url().getHost(); // Logger.getLogger(this.getClass().getName()).log(Level.INFO, "RESPONSE URL: // {0}", responseURL); // doc = 
Jsoup.connect(entry.getLink().toString()).timeout(60000).get(); Logger.getLogger(this.getClass().getName()) .log(Level.INFO, "Parseando la URL: {0}", new Object[] {entry.getLink().toString()}); feedSelectors = dao.retrieve(url); if (findWords( entry.getTitle(), doc, (ArrayList) params.get("searchlist"), (ArrayList) params.get("blacklist"), feedSelectors)) { newEntrySolr = new Post(); String source; if (feed.getHeader() == null || feed.getHeader().getLink() == null) { Logger.getLogger(this.getClass().getName()).log(Level.INFO, "NULL: Link"); source = feedURL.getHost(); } else { Logger.getLogger(this.getClass().getName()) .log(Level.INFO, "NO NULL: {0}", feed.getHeader().getLink().toString()); source = feed.getHeader().getLink().getHost(); // if (source.indexOf("/") != -1) { // source = source.substring(0, source.indexOf("/") + 1); // } } newEntrySolr.setSource(source); newEntrySolr.setDocType("post"); newEntrySolr.setZone( new Zone( String.valueOf(zone.getId()), zone.getName(), zone.getType().getName(), zone.getExtendedString())); newEntrySolr.setPostLatitude(Double.parseDouble((String) params.get("latitud"))); newEntrySolr.setPostLongitude(Double.parseDouble((String) params.get("longitud"))); // newEntry.setId(entry.getUri()); // newEntry.setId(entry.getUri() != null && entry.getUri().length() > 0 ? // entry.getUri().trim() : entry.getLink().trim()+entry.getTitle().trim()); newEntrySolr.setId( entry.getGUID() != null ? entry.getGUID() : (entry.getElementValue("http://www.w3.org/2005/Atom", "id") != null ? entry.getElementValue("http://www.w3.org/2005/Atom", "id") : (entry.getTitle()))); newEntrySolr.setFromUser( new User( null, source, null, null, place != null ? 
new org.zonales.entities.Place( String.valueOf(place.getId()), place.getName(), place.getType().getName()) : null)); newEntrySolr.setTitle(entry.getTitle()); newEntrySolr.setText(entry.getDescriptionAsText()); newEntrySolr.setTags(new ArrayList<String>((ArrayList) params.get("tagslist"))); if (newEntrySolr.getLinks() == null) { newEntrySolr.setLinks(new ArrayList<LinkType>()); } if ((links = getLinks(feedSelectors, doc, responseURL)) != null) { newEntrySolr.getLinks().addAll(links); } newEntrySolr.getLinks().add(new LinkType("source", entry.getLink().toString())); if (newEntrySolr.getActions() == null) { newEntrySolr.setActions(new ArrayList<ActionType>()); } newEntrySolr .getActions() .addAll( getActions( feedSelectors, doc, newEntrySolr.getId(), json, (Boolean) params.get("comments"), source)); if (entry.getPubDate() != null) { newEntrySolr.setCreated((entry.getPubDate().getTime())); } if (entry.getModDate() != null) { newEntrySolr.setModified((entry.getModDate().getTime())); } for (ActionType action : newEntrySolr.getActions()) { if ("comments".equals(action.getType())) { newEntrySolr.setRelevance(action.getCant()); } } if (!json) { newEntrySolr.setVerbatim(gson.toJson(newEntrySolr)); } newsListSolr.add(newEntrySolr); // addToMap(parseResult, feed, feedLink, entry, content, newEntry); } } return "{post: " + gson.toJson(newsListSolr) + "}"; // + comments.toString(); } }