/**
 * Converts one search-result row into a {@link Name} and appends it to {@code results}.
 *
 * <p>The thumbnail is the {@code src} of the first {@code img} under the element whose class is
 * {@code primary_photo}; the display name and profile link come from the first anchor under the
 * element whose class is {@code result_text}. An optional {@code small} element carries a
 * description which, after surrounding parentheses are stripped, is read as one of:
 *
 * <ul>
 *   <li>{@code job, title} - split at the first comma;
 *   <li>text ending in a parenthesised number, such as {@code Title (2000)} - a title only;
 *   <li>anything else - a job only.
 * </ul>
 *
 * <p>A {@link Reference} is only created when the description actually contains a link. A
 * description without an anchor (for example a bare job) used to fail with a
 * {@link NullPointerException}; it now yields an entry with no reference.
 *
 * @param query not used by this implementation
 * @param options not used by this implementation
 * @param tr the result row to read
 * @param results list that receives the parsed entry
 */
@Override
protected void parseRow(
    final String query, final int options, final Element tr, final List<Name> results) {
  final String thumbnailUrl =
      tr.getElementsByAttributeValue("class", "primary_photo")
          .first()
          .getElementsByTag("img")
          .first()
          .attr("src");

  final Element r = tr.getElementsByAttributeValue("class", "result_text").first();
  final Element a = r.getElementsByTag("a").first();
  final String url = Imdb.BASE_URL + a.attr("href");
  final String name = a.ownText();

  String job = "";
  Reference ref = null;

  // first() returns null when the row has no <small> description at all.
  final Element small = r.getElementsByTag("small").first();
  if (small != null) {
    // The link is optional: job-only descriptions carry no anchor.
    final Element refLink = small.getElementsByTag("a").first();
    final String refUrl = refLink == null ? null : Imdb.BASE_URL + refLink.attr("href");

    String desc = small.text();
    if (desc.startsWith("(") && desc.endsWith(")")) {
      desc = desc.substring(1, desc.length() - 1);
    }

    final int comma = desc.indexOf(',');
    if (comma != -1) {
      job = desc.substring(0, comma).trim();
      if (refUrl != null) {
        ref = new Reference(refUrl, desc.substring(comma + 1).trim());
      }
    } else if (desc.matches(".+\\(\\d+\\)")) {
      // No comma, so the whole description is the title text.
      if (refUrl != null) {
        ref = new Reference(refUrl, desc.trim());
      }
    } else {
      job = desc;
    }
  }

  results.add(new Name(url, thumbnailUrl, name, job, ref));
}
/**
 * Reads a numeric counter from the first element under {@code ele} whose
 * {@code data-element-term} attribute equals {@code dataElementTerm}.
 *
 * <p>The number is taken from the own text of that element's first child, with every comma
 * removed before parsing.
 *
 * @param ele element whose subtree is searched
 * @param dataElementTerm attribute value to match
 * @return the parsed count, or {@code -1} when no element matches
 * @throws NumberFormatException if the text left after removing commas is not an integer
 */
private int getCount(Element ele, String dataElementTerm) throws NumberFormatException {
  final Elements matches =
      ele.getElementsByAttributeValue("data-element-term", dataElementTerm);
  if (matches == null || matches.size() <= 0) {
    return -1;
  }

  final String digits = matches.first().child(0).ownText().replaceAll(",", "");
  return Integer.parseInt(digits);
}
/**
 * Parses the data needed to prepare a quote reply.
 *
 * <p>The element with id {@code postform} is located in the parsed page, its {@code action} is
 * resolved against {@code Constants.BASE_URL}, and the values of its named fields plus the
 * first {@code blockquote} are copied into the result. All values are read before any setter
 * is called, so if anything fails the exception is printed and the returned object has had
 * none of its setters invoked.
 *
 * @param responseBody HTML of the page containing the reply form
 * @return the populated reply data, or a freshly constructed instance when parsing failed
 */
public static PrepareQuoteReply parsePrepareQuoteReply(String responseBody) {
  final PrepareQuoteReply reply = new PrepareQuoteReply();
  try {
    final Document page = Jsoup.parse(responseBody);
    page.setBaseUri(Constants.BASE_URL);
    final Element form = page.getElementById("postform");

    final String actionUrl = form.absUrl("action");
    final String formhash = readNamedFieldValue(form, "formhash");
    final String posttime = readNamedFieldValue(form, "posttime");
    final String noticeauthor = readNamedFieldValue(form, "noticeauthor");
    final String noticetrimstr = readNamedFieldValue(form, "noticetrimstr");
    final String noticeauthormsg = readNamedFieldValue(form, "noticeauthormsg");
    final String reppid = readNamedFieldValue(form, "reppid");
    final String reppost = readNamedFieldValue(form, "reppost");
    final String quoteHtml = form.getElementsByTag("blockquote").first().toString();

    reply.setNoticeauthor(noticeauthor);
    reply.setNoticeauthormsg(noticeauthormsg);
    reply.setNoticetrimstr(noticetrimstr);
    reply.setPosttime(posttime);
    reply.setQuoteBody(quoteHtml);
    reply.setReppid(reppid);
    reply.setUrl(actionUrl);
    reply.setFormhash(formhash);
    reply.setReppost(reppost);
  } catch (Exception e) {
    e.printStackTrace();
  }
  return reply;
}

/** Returns the {@code value} attribute of the first element under {@code form} with the given name. */
private static String readNamedFieldValue(Element form, String fieldName) {
  return form.getElementsByAttributeValue("name", fieldName).first().attr("value");
}
public static List<Movie> getIntroduction(String url) { List<Movie> movieList = new ArrayList<Movie>(); Document doc = null; try { doc = Jsoup.connect("http://movie.douban.com/").timeout(5000).get(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } if (doc == null) { try { doc = Jsoup.connect("http://movie.douban.com/").timeout(5000).get(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } if (doc == null) { return null; } Elements mainUL = doc.select("ul#newcontent1"); Elements elem = mainUL.get(0).getElementsByTag("li"); for (int i = 0; i < elem.size(); i++) { // Log.i("li"+i, elem.get(i).html()); Movie m = new Movie(); Element tempElement = elem.get(i); Elements imgDiv = tempElement.getElementsByAttributeValue("class", "img"); m.setImgHtml(imgDiv.get(0).html()); // Log.i("img_url:", imgDiv.get(0).html()); Elements elems = imgDiv.get(0).children(); String imgURL = null; imgURL = elems .get(0) .html() .substring( elems.get(0).html().indexOf("\"") + 1, elems.get(0).html().lastIndexOf("\"")); // Log.d("Movie News", imgURL); m.setImgUrl(imgURL); // get the movie's index at movie.douban.com String doubanUrl = imgDiv.get(0).html(); int start = doubanUrl.indexOf("href=\"") + 6; int end = start; while (doubanUrl.charAt(++end) != '\"') ; doubanUrl = doubanUrl.substring(start, end); m.setDoubanUrl(doubanUrl); String id = doubanUrl.substring(doubanUrl.indexOf("subject/") + 8, doubanUrl.length() - 1); m.setID(Integer.parseInt(id)); // Log.e(doubanUrl, id); // Log.e("Movie News", doubanUrl); Elements introDiv = tempElement.getElementsByAttributeValue("class", "intro"); // save the rating node to extract this movie's rating Node rating = introDiv.get(0).child(1); List<Node> nodeList = rating.childNodes(); // rank of the movie instance will be set to the number from the extracted node m.setDoubanRank(nodeList.get(2).toString().trim()); // remove the rating node from the parent 
introDiv.get(0).child(1).remove(); // remove the number of current movie introDiv.get(0).childNodes().get(1).childNode(0).remove(); // get the movie's name List<Node> childList = introDiv.get(0).childNodes().get(1).childNodes().get(0).childNodes(); String title = childList.get(0).toString(); if (title.contains("·")) { Log.e("Movie News", "tag found"); title = title.replace("·", "."); } m.setMovieName(title); m.setInfoHtml(introDiv.get(0).html()); movieList.add(m); } // System.out.println("working"); return movieList; }
public static ArrayList<RequestInfo> parseApiDoc(String path) { File file = new File(path); String response = FileUtils.readToString(file, "UTF-8"); Document parse = Jsoup.parse(response); // 接口类型总称 api-Search String mainName = parse.getElementsByTag("section").get(0).attr("id"); // System.out.println("-> type = " + mainName); // System.out.println(); ArrayList<RequestInfo> requestInfos = new ArrayList<RequestInfo>(); // 全部类型接口 for (Element e : parse.getAllElements()) { String attrId = e.attr("id"); // div的标签,且id名前缀为mainName // 为类型下单个接口 api-Search-PostSearchCompany if (e.tagName().equals("div") && attrId.startsWith(mainName)) { RequestInfo requestInfo = new RequestInfo(); // System.out.println("---> name = " + attrId); requestInfo.setName(attrId); // method post/get String method = e.getElementsByTag("pre").get(0).attr("data-type"); // System.out.println("-----> method = " + method); requestInfo.setMethod(method); // url String url = e.getElementsByAttributeValue("class", "pln").get(0).text(); // System.out.println("-----> url = " + url); requestInfo.setUrl(url); // des String des = e.getElementsByAttributeValue("class", "pull-left").get(0).text(); // System.out.println("-----> des = " + des); requestInfo.setDes(des); // post params Element ePostParams = e.getElementsByTag("table").get(0); ArrayList<RequestParam> params = new ArrayList<RequestParam>(); for (Element ePostParam : ePostParams.getElementsByTag("tr")) { // param 字段 Elements eColumn = ePostParam.getElementsByTag("td"); if (eColumn.size() == 0) { continue; } // 标签"选项" // String label = ePostParam.getElementsByAttributeValue("class", "label // label-optional") // .get(0).text(); String label = "选项"; // 第一个字段为参数名 String paramName = eColumn.get(0).text(); // 去除标签 paramName = paramName.replace(label, "").trim(); // 第二个字段为参数类型 // 可能类型为 String Number Float String paramType = eColumn.get(1).text(); // 第三个字段为参数描述 String paramDes = eColumn.get(2).text(); // System.out.println("-----> param = " + paramName + " ... 
" // + paramType + " ... " + paramDes); RequestParam param = new RequestParam(paramName, paramType, paramDes); params.add(param); } requestInfo.setParams(params); requestInfos.add(requestInfo); // System.out.println(); } } return requestInfos; }
public static void getLessons(Document doc, Schedule s) throws ParseException { final SimpleDateFormat format = new SimpleDateFormat("dd.MM.yyyy HH:mm", Locale.ENGLISH); format.setTimeZone(TimeZone.getTimeZone("Europe/Moscow")); Elements lessonCells = doc.getElementsByAttribute("number"); for (Element lessonCell : lessonCells) { Lesson l, lPrev = null; // lPrev to handle duplicate lesson int sameLesson = 0; // Also to handle duplicate lesson int number = Integer.parseInt(lessonCell.attr("number")); String time = ""; Elements timeDetails = lessonCell.getElementsByClass("cell-header2"); for (Element timeDetail : timeDetails) { if (timeDetail.hasAttr("style")) time = timeDetail.text(); } Elements lessonCellDetails = lessonCell.getElementsByAttribute("jsdate"); for (Element lessonCellDetail : lessonCellDetails) { String date = lessonCellDetail.attr("jsdate"); int index = 0; sameLesson = 0; for (Element subject : lessonCellDetail.getElementsByAttributeValue("class", "lesson-subject")) { if (subject == null || subject.text() == null || subject.text().length() <= 0) { // No lesson scheduled continue; } Date start = format.parse(date + " " + time.substring(0, time.indexOf("-") - 1)); if ((l = s.getLessonByNumber(start, number)) == null) { if (BuildConfig.DEBUG) Log.d("GshisHTMLParser", TS.get() + " getLessons() not found in db, will insert"); l = new Lesson(); sameLesson = 0; l.setStart(start); l.setStop( format.parse(date + " " + time.substring(time.indexOf("-") + 2, time.length()))); l.setFormId(subject.attr("id")); l.setFormText(subject.text()); l.setTeacher( lessonCellDetail .getElementsByAttributeValue("class", "lesson-teacher") .get(sameLesson) .text()); l.setNumber(number); s.addLesson(l); } else { if (BuildConfig.DEBUG) Log.d("GshisHTMLParser", TS.get() + " getLessons() found in db, will update"); l.setFormId(subject.attr("id")); if (lPrev != null && lPrev.getStart().equals(start) && lPrev.getNumber() == number) { if (BuildConfig.DEBUG) Log.d( "GshisHTMLParser", 
TS.get() + " getLessons() dup = " + subject.text() + " index = " + index + " sameLesson = " + sameLesson); sameLesson++; if (!lPrev.getFormText().equals(subject.text())) l.setFormText(fixDuplicateString(subject.text(), lPrev.getFormText(), sameLesson)); String teacher = lessonCellDetail .getElementsByAttributeValue("class", "lesson-teacher") .get(index) .text(); if (!lPrev.getTeacher().equals(teacher)) l.setTeacher(fixDuplicateString(teacher, lPrev.getTeacher(), sameLesson)); } else { l.setNumber(number); l.setFormText(subject.text()); l.setTeacher( lessonCellDetail .getElementsByAttributeValue("class", "lesson-teacher") .get(index) .text()); } l.update(); } lPrev = l; index++; } } } }