public void getNewsInfo(String NewsUrl) { // 获得新闻来源URL try { System.out.println(NewsUrl); Document Doc = Jsoup.connect(NewsUrl).userAgent("Mozilla").cookie("auth", "token").timeout(3000).get(); Element textDIV = Doc.select("div[style=height:800px; overflow-y:scroll; width:100%;]").first(); Element TitleEle = textDIV.select("strong").first(); String Title = TitleEle.text(); // 获得文章title String PublishTime = getDate(NewsUrl); // 获得文章发表日期 Elements ContentPTags = textDIV.select("div[id=ozoom]").select("p"); String Content = "\r\n"; // 获得文章正文内容 for (Element ContentPTag : ContentPTags) { Content += ContentPTag.text() + "\r\n"; } List<String> IMGList = new ArrayList<String>(); // 获得图片地址列表 Elements IMGs = textDIV.select("td[align=center]").select("img[src]"); for (Element IMG : IMGs) { IMGList.add(IMG.attr("abs:src")); } savexml.format.source = NewsUrl; savexml.format.title = Title; savexml.format.publishtime = PublishTime; savexml.format.body = Content; savexml.format.img = IMGList; savexml.save(); } catch (Exception e) { e.printStackTrace(); } }
private List<ArtifactVersionBean> parseMavenMetadata(Document doc) { String groupId = doc.getElementsByTag("groupId").text(); String artifactId = doc.getElementsByTag("artifactId").text(); if (!StringUtils.hasText(groupId) || !StringUtils.hasText(artifactId)) { return Lists.newArrayListWithCapacity(0); } Elements versions = doc.getElementsByTag("version"); List<ArtifactVersionBean> artifactList = Lists.newArrayList(); for (Element version : versions) { ArtifactVersionBean artifactVersionBean = new ArtifactVersionBean(); artifactVersionBean.setGroupId(groupId); artifactVersionBean.setArtifactId(artifactId); artifactVersionBean.setVersion(version.text()); artifactVersionBean.setId(groupId + ":" + artifactId + ":" + version.text()); // Gets and convert the last update date Long lastUpdateDate = retrieveLastUpdateDate(artifactVersionBean); if (lastUpdateDate == null) { continue; } artifactVersionBean.setTimestamp(lastUpdateDate); artifactList.add(artifactVersionBean); } return artifactList; }
/**
 * Extracts areas from the first element with class {@code subarea} in the given HTML.
 *
 * <p>The children of that element are scanned starting at the second child: a {@code b} element
 * sets the current letter, and every {@code a} element becomes an {@link AreaVO} carrying that
 * letter, the link text, the running {@code index} counter (incremented per area) and a pinyin
 * taken from the link's {@code href} (characters from offset 7 up to the last {@code /}).
 *
 * @param text HTML to parse
 * @param pid parent id stored on every area
 * @return the areas in document order; empty when no {@code subarea} element exists
 */
public List<AreaVO> parseMessage(String text, int pid) {
  final List<AreaVO> result = new ArrayList<AreaVO>();
  final Elements containers = Jsoup.parse(text).body().getElementsByClass("subarea");
  if (containers.size() <= 0) {
    return result;
  }

  final Elements children = containers.get(0).children();
  String currentLetter = "";
  for (int pos = 1; pos < children.size(); pos++) {
    final Element node = children.get(pos);
    final String tag = node.tagName();
    if ("b".equals(tag)) {
      currentLetter = node.text();
    } else if ("a".equals(tag)) {
      final AreaVO area = new AreaVO();
      area.setLetter(currentLetter);
      area.setName(node.text());
      area.setOrderIdx(index);
      area.setPid(pid);
      final String link = node.attr("href");
      area.setPinyin(link.substring(7, link.lastIndexOf("/")));
      index++;
      System.out.println(area.toString());
      result.add(area);
    }
  }
  return result;
}
/**
 * Finds the currently selected pupil in the page, inserting any pupil not yet known.
 *
 * <p>Every {@code option} below an element with id {@code ctl00_topMenu_pupil_drdPupils} is looked
 * up with {@code Pupil.getByFormId}; unknown ones are created and inserted. The option whose
 * {@code selected} attribute equals {@code "selected"} is returned. When the page contains no such
 * options at all, the pupil is instead read from the {@code user-name} element and the element
 * with id {@code ctl00_topMenu_tbUserId}.
 *
 * @param doc parsed page
 * @return the selected pupil, never {@code null}
 * @throws ParseException if no pupil can be determined, including when the fallback elements are
 *     missing (previously that case surfaced as a NullPointerException)
 */
public static Pupil getSelectedPupil(Document doc) throws ParseException {
  boolean found = false;
  Pupil selected = null;

  Elements selectors = doc.getElementsByAttributeValue("id", "ctl00_topMenu_pupil_drdPupils");
  for (Element selector : selectors) {
    for (Element candidate : selector.getAllElements()) {
      if (!candidate.tagName().equals("option")) {
        continue;
      }
      found = true;
      String formId = candidate.attr("value");
      Pupil pupil = Pupil.getByFormId(formId);
      if (pupil == null) {
        pupil = new Pupil(candidate.text(), formId);
        long rowId = pupil.insert();
        if (BuildConfig.DEBUG) {
          Log.d("GshisHTMLParser", TS.get() + " Pupil.insert() = " + rowId);
        }
      }
      if (candidate.hasAttr("selected") && candidate.attr("selected").equals("selected")) {
        selected = pupil;
      }
    }
  }

  if (!found) {
    if (BuildConfig.DEBUG) {
      Log.d("GshisParser", TS.get() + " Alternative fields found!");
    }
    Element userName = doc.getElementsByClass("user-name").first();
    Element userId = doc.getElementsByAttributeValue("id", "ctl00_topMenu_tbUserId").first();
    // first() returns null when nothing matched: report it through the declared exception.
    if (userName == null || userId == null) {
      throw new ParseException("Pupils not found", 0);
    }
    String name = userName.text();
    String id = userId.attr("value");
    if (BuildConfig.DEBUG) {
      Log.d("GshisParser", TS.get() + " name=" + name + " id=" + id);
    }
    Pupil pupil = Pupil.getByFormId(id);
    if (pupil == null) {
      pupil = new Pupil(name, id);
      long rowId = pupil.insert();
      if (BuildConfig.DEBUG) {
        Log.d("GshisParser", TS.get() + " Pupil.insert() = " + rowId);
      }
    }
    selected = pupil;
  }

  if (selected == null) {
    throw new ParseException("Pupils not found", 0);
  }
  return selected;
}
/**
 * Walks {@code spans} from left to right, appending each span's text (plus a space) to
 * {@code name} and raising {@code maxY1} to the largest {@code y1} seen.
 *
 * <p>Coordinates are read from the whitespace-separated {@code title} attribute: token 1 is x1,
 * token 2 is y1 and token 3 is x2. The walk stops at the first span whose x1 lies left of the
 * previous span's x2. A title with too few tokens is reported on standard output and skipped.
 *
 * <p>NOTE(review): the previous comment described the result as the "lowest Y1", but the code
 * keeps the maximum - confirm which one is intended.
 */
private void setMaxY1() {
  Element previousSpan = null;
  int previousX2 = 0;
  for (Element span : spans) {
    try {
      final String[] tokens = span.attr("title").split("\\s+");
      final int x1 = Integer.parseInt(tokens[1]);
      final int x2 = Integer.parseInt(tokens[3]);
      final int y1 = Integer.parseInt(tokens[2]);
      if (x1 < previousX2) {
        break;
      }
      name = name + span.text() + " ";
      if (y1 > maxY1) {
        this.maxY1 = y1;
      }
      previousX2 = x2;
      previousSpan = span;
    } catch (IndexOutOfBoundsException e) {
      System.out.println("This table got a weird name it raised the following error: ");
      if (previousSpan != null) {
        System.out.println(previousSpan.text());
      }
      System.out.println(e);
    }
  }
}
@Override protected RemoteDetectionResult detectRemoteRepository( final ScrapeContext context, final Page page) { // cheap checks first, to quickly eliminate target without doing any remote requests if (page.getHttpResponse().getStatusLine().getStatusCode() == 200) { final Elements elements = page.getDocument().getElementsByTag("a"); if (!elements.isEmpty()) { // get "template" parent link final Element templateParentLink = getParentDirectoryElement(page); // get the page parent link (note: usually it's 1st elem, but HTTPD for example has extra // links for // column // sorting for (Element element : elements) { // if text is same and abs URLs points to same place, we got it if (templateParentLink.text().equals(element.text()) && templateParentLink.absUrl("href").equals(element.absUrl("href"))) { return new RemoteDetectionResult( RemoteDetectionOutcome.RECOGNIZED_SHOULD_BE_SCRAPED, getTargetedServer(), "Remote is a generated index page of " + getTargetedServer()); } } } } // um, we were not totally positive, this might be some web server with index page similar to // Nexus one return new RemoteDetectionResult( RemoteDetectionOutcome.UNRECOGNIZED, getTargetedServer(), "Remote is not a generated index page of " + getTargetedServer()); }
/** * 解析首页 获取首页的所有链接 * * @param html 首页html文本 * @return Map<String,String> 链接子集 * @throws Exception */ public static Map<String, String> parseIndexHtml(String html) throws Exception { Map<String, String> urlMap = new HashMap<String, String>(); Document doc = Jsoup.parse(html); Elements links = doc.select("a[href]"); Element content = doc.getElementById("xhxm"); Element tonggous = doc.getElementById("xsrs"); tonggou = tonggous.text(); // 通告内容 String studentname = content.text().substring(0, content.text().lastIndexOf("同")); // 郭灶鹏同学 String replacename = URLEncoder.encode(studentname, "gb2312"); // 转成gb2312编码 for (Element link : links) { String linkHref = link.attr("href"); String linkText = link.text(); if ("退出".equals(linkText)) { continue; } if ("#".equals(linkHref) || "#a".equals(linkHref)) { continue; } else { String truelinkHref = linkHref.replaceAll(studentname, replacename); // 真正的url System.out.println(linkText + ":" + truelinkHref); urlMap.put(linkText, truelinkHref); } } return urlMap; }
/**
 * Loads the sample report HTML once and fills {@code summary_table} and
 * {@code summary_error_table} from the element with id {@code market-feature-header}.
 *
 * <p>Each {@code rTableHead} header is paired by position with an {@code rTableCell} value; each
 * {@code rTableHeadFailed} header is paired by position with an {@code rTableCellFailed} value and
 * with the {@code href} of the link at the same position.
 */
@BeforeClass
public static void setUp() {
  final File fixture =
      new File("src/test/java/org/jenkinsci/plugins/marketfeaturereport/market_features.html");
  Document parsed = null;
  try {
    parsed = Jsoup.parse(fixture, "UTF-8");
  } catch (IOException e) {
    e.printStackTrace();
  }
  assert parsed != null;

  final Element section = parsed.getElementById("market-feature-header");
  final Elements passedHeaders = section.getElementsByClass("rTableHead");
  final Elements failedHeaders = section.getElementsByClass("rTableHeadFailed");
  final Elements passedCells = section.getElementsByClass("rTableCell");
  final Elements failedCells = section.getElementsByClass("rTableCellFailed");

  for (int i = 0; i < passedHeaders.size(); i++) {
    summary_table.put(passedHeaders.get(i).text(), passedCells.get(i).text());
  }

  final Elements errorLinks = section.getElementsByTag("a");
  for (int i = 0; i < failedHeaders.size(); i++) {
    final String headerText = failedHeaders.get(i).text();
    summary_table.put(headerText, failedCells.get(i).text());
    summary_error_table.put(headerText, errorLinks.get(i).attr("href"));
  }
}
@Override public Object parseHtml2Obj(String html) { Document doc = Jsoup.parse(html); Element title = doc.getElementById("activity-name"); Element createtime = doc.getElementById("post-date"); // Element from = doc.getElementById("post-user"); Element content = doc.getElementById("essay-body"); Elements pic = doc.select("#media img"); Elements _intro = doc.select(".text p"); String intro = null; if (_intro.isEmpty()) { intro = "阅读全部"; } else { intro = _intro.first().text(); } // List<ArticleObj> objs = new ArrayList<ArticleObj>(); ArticleObj obj = new ArticleObj(); obj.setFrom(account_desc); obj.setContent(content.html()); obj.setCreatetime(createtime.text()); obj.setTitle(title.text()); obj.setIntro(intro.substring(0, intro.length() > 50 ? 50 : intro.length()) + "..."); if (!pic.isEmpty()) { String src = pic.get(0).attr("src"); obj.setPic(getSrc(src)); } System.err.println(obj.getPic()); dbRobot.AddArticleData(obj); cur_count++; return null; }
//// COMPLETAMENTE INUTILE public static int[] getPrice(String path) { int[] month = new int[31]; int count = 0; try { File input = new File(path); Document doc = Jsoup.parse(input, "UTF-8", "http://example.com/"); Elements elementi_div = doc.getElementsByTag("div"); for (Element e : elementi_div) { if (e.text().length() > 0) if (Character.isDigit(e.text().charAt(0)) && e.text().contains("€ ")) { count++; String[] arr = e.text().split(" "); month[Integer.parseInt(arr[0]) - 1] = Integer.parseInt(arr[2].replace(".", "")); } } } catch (Exception e) { System.out.println(e); } if (count == 0) { System.out.println("Non e' stato scaricato il file"); // getPrice(path); } return month; }
@Override public String getContentText(Document document) { String ret = ""; // System.out.println(this.getClass().getName()); // Element e = document.getElementById("video_tags"); // if(e != null){ // ret = ret + " " + e.text().toString(); // } for (Element e : document.getElementsByAttributeValue("class", "starPicTxt")) { ret = ret + " " + e.text().toString(); } for (Element e : document.getElementsByAttributeValue("class", "dlTxt clearfix")) { ret = ret + " " + e.text().toString(); } for (Element e : document.getElementsByAttributeValue("class", "v-star-info")) { ret = ret + " " + e.text().toString(); } // for(Element e : document.getElementsByAttributeValue("class", "listCon")){ // ret = ret + " " + e.text().toString(); // } if (ret.isEmpty() == false) { ret = ret.substring(1); } return ret; }
/**
 * Downloads the mess menu page and fills {@code breakfast}, {@code lunch} and {@code dinner}.
 *
 * <p>The first {@code tbody} is used when {@code mess[0]} is {@code "dh1"}, the second otherwise.
 * Within it, the children of the 2nd, 3rd and 4th {@code td} are the breakfast, lunch and dinner
 * items respectively.
 *
 * @param mess mess identifier in position 0
 * @return {@code true} on success; {@code false} after an I/O error or when the page lacks the
 *     expected elements (the stack trace is printed)
 */
@Override
protected Boolean doInBackground(String... mess) {
  try {
    final Document page = Jsoup.connect("http://messmenu.snu.in/messMenu.php").get();
    final int tableIndex = mess[0].equals("dh1") ? 0 : 1;
    final Elements cells = page.getElementsByTag("tbody").get(tableIndex).getElementsByTag("td");

    // Resolve all three columns before adding anything, so a malformed page adds nothing.
    final Elements breakfastItems = cells.get(1).children();
    final Elements lunchItems = cells.get(2).children();
    final Elements dinnerItems = cells.get(3).children();

    for (Element item : breakfastItems) {
      breakfast.add(item.text());
    }
    for (Element item : lunchItems) {
      lunch.add(item.text());
    }
    for (Element item : dinnerItems) {
      dinner.add(item.text());
    }
    return true;
  } catch (IOException | IndexOutOfBoundsException e) {
    e.printStackTrace();
    return false;
  }
}
/**
 * Looks up the course type through the Unistats widget referenced by the page.
 *
 * <p>The widget URL is built from the {@code data-*} attributes of the first {@code #kw} element.
 * The widget is fetched with a 10 second timeout, and the first word of its course heading
 * ({@code #kisWidget > div.widgetCourse > h1}) is the type.
 *
 * @param doc course page
 * @return the first word of the heading (the whole heading if it has no space), or an empty
 *     string when the page has no {@code #kw} element, the widget has no heading or the request
 *     fails (the stack trace is printed)
 */
public static String getType(Document doc) {
  final Elements widgetHolders = doc.select("#kw");
  if (widgetHolders.size() <= 0) {
    return "";
  }

  final Element holder = widgetHolders.get(0);
  final String widgetUrl =
      new StringBuilder()
          .append("http://widget.unistats.ac.uk/Widget/")
          .append(holder.attr("data-institution") + "/")
          .append(holder.attr("data-course") + "/")
          .append(holder.attr("data-orientation") + "/")
          .append("null/")
          .append(holder.attr("data-language") + "/")
          .append(holder.attr("data-kismode"))
          .toString();

  String type = "";
  try {
    final Document widget = Jsoup.connect(widgetUrl).timeout(10000).get();
    final Elements headings = widget.select("#kisWidget > div.widgetCourse > h1");
    if (headings.size() > 0) {
      final String heading = headings.get(0).text().trim();
      final int firstSpace = heading.indexOf(" ");
      type = firstSpace > 0 ? heading.substring(0, firstSpace) : heading;
    }
  } catch (Exception ex) {
    ex.printStackTrace();
  }
  return type;
}
/**
 * Scrapes a Best Buy listing page and the linked specification page.
 *
 * <p>The incoming {@code doc} argument is ignored: it is immediately replaced by the document
 * fetched from {@code url}. The result holds {@code price}, {@code modelNumber} and {@code title}
 * from the listing, {@code upc} from the first specification row mentioning "UPC", and
 * {@code GoodSKU} set to {@code "true"} exactly when the specification table has rows.
 *
 * @param doc overwritten before use
 * @param url listing page to fetch
 * @return the collected values
 */
public HashMap<String, String> initialBestBuyScan(Document doc, String url) {
  doc = jsoupConnect(url);
  final HashMap<String, String> found = new HashMap<String, String>();

  final Elements titleLink = doc.select(".list-item-info .sku-title h4 a");
  found.put("price", doc.select(".medium-item-price").text());
  found.put("modelNumber", doc.select(".list-item-info .sku-model ul .model-number").text());
  found.put("title", titleLink.text());

  final String specsUrl = "http://bestbuy.com" + bestBuySpecsFormatter(titleLink.attr("href"));
  System.out.println(specsUrl);
  doc = jsoupConnect(specsUrl);

  final Elements specRows = doc.select("#full-specifications table tbody tr");
  for (Element row : specRows) {
    final String rowText = row.text();
    if (rowText.contains("UPC")) {
      found.put("upc", rowText.replace("UPC ", ""));
      break;
    }
  }
  found.put("GoodSKU", specRows.size() < 1 ? "false" : "true");

  doc.empty();
  return found;
}
/**
 * Requests the inbound timetable for the bus and stop of {@code arrival}, starting now, and turns
 * every list item of the {@code busroutelist} element into an {@link Arrival}.
 *
 * <p>For each item the time is the first five characters of the {@code routelist-time} text, and
 * the stop code is taken from the destination link: a leading {@code /v3/uk/bus/stop/} is removed
 * and anything from the next {@code /} onwards is dropped.
 *
 * @param arrival supplies the bus (operator, route) and the stop (atcocode) to query
 * @return one arrival per listed stop, in page order
 * @throws Exception on a non-200 response (an {@code IOException} carrying the reason phrase) or
 *     any other failure while fetching or parsing
 */
public List<Arrival> busTimetable(final Arrival arrival) throws Exception {
  final Calendar now = Calendar.getInstance(Locale.UK);
  final String routePath =
      String.format(
          "v3/uk/bus/route/%s/%s/inbound/%s/%s/%s/timetable",
          arrival.bus.operator,
          arrival.bus.route,
          arrival.stop.atcocode,
          dateFormat.format(now.getTime()),
          timeFormat.format(now.getTime()));
  final Uri url =
      Uri.parse("http://transportapi.com")
          .buildUpon()
          .path(routePath)
          .appendQueryParameter("api_key", apiKey)
          .appendQueryParameter("app_id", appId)
          .appendQueryParameter("group", "no")
          .build();
  Log.d("JSON API", String.format("Requesting %s", url));

  final HttpResponse response = http.execute(new HttpGet(url.toString()));
  final StatusLine status = response.getStatusLine();
  if (status.getStatusCode() != HttpStatus.SC_OK) {
    response.getEntity().getContent().close();
    throw new IOException(status.getReasonPhrase());
  }

  final Document page = Jsoup.parse(EntityUtils.toString(response.getEntity()), url.toString());
  final Elements items = page.getElementsByClass("busroutelist").first().getElementsByTag("li");

  final String stopPrefix = "/v3/uk/bus/stop/";
  final ArrayList<Arrival> timetable = new ArrayList<Arrival>();
  for (Element item : items) {
    final Element timeNode = item.getElementsByClass("routelist-time").first();
    final Time when = parseSimpleTime(timeNode.text().substring(0, 5));

    final Element destinationNode = item.getElementsByClass("routelist-destination").first();
    String stopCode = destinationNode.getElementsByTag("a").first().attr("href");
    if (stopCode.startsWith(stopPrefix)) {
      stopCode = stopCode.substring(stopPrefix.length());
    }
    final int slash = stopCode.indexOf('/');
    if (slash > 0) {
      stopCode = stopCode.substring(0, slash);
    }

    timetable.add(new Arrival(arrival.bus, new Stop(stopCode, destinationNode.text()), when));
  }
  return timetable;
}
/**
 * Builds the tomes (volumes) and their chapters from the first {@code div.detail_list} of a
 * series page, and stamps {@code parent} with today's date as validity date.
 *
 * <p>Each {@code span.left} describes one chapter. Its volume number is the text after "Vol " in
 * the nested {@code span.mr6} (0 when absent); chapters sharing a volume number are attached to
 * the same {@link Tome}. The chapter number is the last space-separated token of the link text.
 *
 * @param htmlDocument series page
 * @param parent series the tomes belong to
 * @return the tomes in order of first appearance; empty when the page has no chapter list
 */
@Override
protected List<Tome> parseTomes(Document htmlDocument, Serie parent) {
  Date today = new Date();
  List<Tome> tomes = new LinkedList<>();

  Elements divChapters = htmlDocument.select("div.detail_list");
  if (!divChapters.isEmpty()) {
    for (Element span : divChapters.first().select("span.left")) {
      Elements tomeNumberElements = span.select("span.mr6");
      final String tomeNumberString =
          StringUtils.substringAfter(tomeNumberElements.first().text(), "Vol ");
      int tomeNumber = 0;
      if (tomeNumberString != null && !tomeNumberString.isEmpty()) {
        // The parsed value used to be thrown away, which put every chapter into tome 0.
        tomeNumber = Integer.parseInt(tomeNumberString.trim());
      }

      // Reuse the tome already created for this volume number, if any.
      Tome foundTome = null;
      for (Tome tome : tomes) {
        if (tomeNumber == tome.getNumber()) {
          foundTome = tome;
          break;
        }
      }
      if (foundTome == null) {
        foundTome = new Tome();
        foundTome.setNumber(tomeNumber);
        foundTome.setName("Tome " + tomeNumber);
        foundTome.setMustBeSaved(true);
        foundTome.setValidityDate(today);
        foundTome.setSerie(parent);
        tomes.add(foundTome);
      }

      Element link = span.select("a").first();
      Chapter chapter = new Chapter();
      chapter.setMustBeSaved(true);
      chapter.setUrl(link.attr("href"));
      String chapterNumber = StringUtils.substringAfterLast(link.text(), " ");
      chapter.setNumber(Float.parseFloat(chapterNumber));
      chapter.setName(span.text());
      chapter.setTome(foundTome);
      foundTome.addChapter(chapter);
    }
  }

  parent.setValidityDate(today);
  return tomes;
}
/**
 * Registers every week offered by the week selector with the schedule and returns the selected
 * one.
 *
 * <p>Each {@code option} below an element with id {@code ctl00_body_week_drdWeeks} is looked up
 * in {@code s} by its {@code value}. Unknown weeks are created: the start date is the part of the
 * option text before {@code " -"}, combined with a year taken from the schedule's form text (the
 * part before the dash when the month is greater than 7, the part after it otherwise) and parsed
 * as {@code yyyy dd.MM} in the Europe/Moscow time zone. The option whose {@code selected}
 * attribute equals {@code "selected"} is marked as loaded, updated and returned.
 *
 * @param doc parsed page
 * @param s schedule that owns the weeks
 * @return the selected week, or {@code null} when options exist but none is selected
 * @throws ParseException if the page has no week options or a start date cannot be parsed
 */
public static Week getSelectedWeek(Document doc, Schedule s) throws ParseException {
  final SimpleDateFormat startFormat = new SimpleDateFormat("yyyy dd.MM", Locale.ENGLISH);
  startFormat.setTimeZone(TimeZone.getTimeZone("Europe/Moscow"));

  boolean anyOption = false;
  Week chosen = null;
  for (Element selector : doc.getElementsByAttributeValue("id", "ctl00_body_week_drdWeeks")) {
    for (Element option : selector.getAllElements()) {
      if (!option.tagName().equals("option")) {
        continue;
      }
      anyOption = true;
      final String label = option.text();
      final String formId = option.attr("value");

      Week current = s.getWeek(formId);
      if (current == null) {
        current = new Week();
        final String begin = label.substring(0, label.indexOf("-") - 1);
        final String month = begin.substring(begin.indexOf(".") + 1);
        final boolean secondHalfOfYear = Integer.parseInt(month) > 7;
        final String years = s.getFormText();
        final int dash = years.indexOf("-");
        final String year =
            secondHalfOfYear ? years.substring(0, dash - 1) : years.substring(dash + 2);
        current.setStart(startFormat.parse(year + " " + begin));
        current.setFormText(label);
        current.setFormId(formId);
        s.addWeek(current);
      }

      if (option.hasAttr("selected") && option.attr("selected").equals("selected")) {
        chosen = current;
        final long updated = current.setLoaded().update();
        if (BuildConfig.DEBUG) {
          Log.d("GshisHTMLParser", TS.get() + " Week.update() = " + updated);
        }
      }
    }
  }

  if (!anyOption) {
    throw new ParseException("Weeks not found", 0);
  }
  return chosen;
}
/**
 * Parses the feed HTML, appends one {@link FeedItem} per {@code div.tie-box} to
 * {@code mFeedItems} and notifies the adapter.
 *
 * <p>If the page does not have the expected structure the exception is printed and parsing stops;
 * items added before the failure are kept and the adapter is still notified.
 *
 * @param resource feed page HTML
 */
private void parseFeedItem(String resource) {
  try {
    Document doc = Jsoup.parse(resource);
    Element wrapper = doc.select("div.tie-wrapper").first();
    Elements feedBoxes = wrapper.select("div.tie-box");

    for (Element feedPost : feedBoxes) {
      FeedItem feedItem = new FeedItem();

      Element titleElement = feedPost.select("div.tie-header h2.tie-title a").first();
      Element nameElement =
          feedPost.select("div.tie-content div.tie-user div.user-info p span.user-name").first();
      Element sourceElement =
          feedPost.select("div.tie-content div.tie-user div.user-info p span.user-form").first();
      Element timestampElement =
          feedPost.select("div.tie-content div.tie-user div.user-info p.tie-date").first();
      Elements imageElements = feedPost.select("div.tie-content img.st-photo");
      Elements contentElements = feedPost.select("div.tie-content p:not(.tie-date):gt(0)");

      String title = titleElement.text();
      String name = nameElement.text();
      String source = sourceElement.text();
      String timestamp = timestampElement.text();

      StringBuilder content = new StringBuilder();
      for (Element paragraph : contentElements) {
        content.append(paragraph.text()).append("\n");
      }

      // Compare by value: the previous `!= ""` test compared object identity, not content.
      String imageSrc = imageElements.attr("src");
      String image = imageSrc.isEmpty() ? null : url + imageSrc;

      feedItem.setTitle(title);
      feedItem.setName(name);
      feedItem.setPostTime(timestamp);
      feedItem.setSource(source);
      feedItem.setImage(image);
      feedItem.setContent(content.toString());
      mFeedItems.add(feedItem);
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
  mFeedItemAdapter.notifyDataSetChanged();
}
/**
 * Creates a book named after the trimmed text of {@code bookElement} and adds one chapter per
 * link found in {@code bookIndexElement}.
 *
 * <p>For each link, the link text is parsed as the integer chapter id and the {@code href} is the
 * chapter page; both are passed to {@code createChapter}.
 *
 * @param bookElement element holding the book name
 * @param bookIndexElement element holding the chapter links
 * @return the populated book
 */
public Book createBook(Element bookElement, Element bookIndexElement) {
  final Book result = new Book(bookElement.text().trim());
  for (Element anchor : bookIndexElement.select("a")) {
    final String pageUrl = anchor.attr("href");
    final int id = Integer.parseInt(anchor.text());
    result.addChapter(createChapter(id, pageUrl));
  }
  return result;
}
// start setting of list in right rail public void rightRailList( Node listNode, Element rightListEle, Map<String, String> urlMap, String locale) { try { Element title; Element description; Elements headElements = rightListEle.getElementsByTag("h2"); if (headElements.size() > 1) { title = rightListEle.getElementsByTag("h2").last(); description = rightListEle.getElementsByTag("p").last(); sb.append("<li>Mismatch in count of list panel component in right rail.</li>"); } else { title = rightListEle.getElementsByTag("h2").first(); description = rightListEle.getElementsByTag("p").first(); } listNode.setProperty("title", title.text()); javax.jcr.Node introNode = listNode.getNode("intro"); introNode.setProperty("paragraph_rte", description.text()); javax.jcr.Node eleListNode = listNode.getNode("element_list_0"); Elements ulList = rightListEle.getElementsByTag("ul"); for (Element element : ulList) { java.util.List<String> list = new ArrayList<String>(); Elements menuLiList = element.getElementsByTag("li"); for (Element li : menuLiList) { JSONObject jsonObjrr = new JSONObject(); Element listItemAnchor = li.getElementsByTag("a").first(); String anchorText = listItemAnchor != null ? 
listItemAnchor.text() : ""; String anchorHref = listItemAnchor.absUrl("href"); if (StringUtil.isBlank(anchorHref)) { anchorHref = listItemAnchor.attr("href"); } // Start extracting valid href log.debug("Before right list LinkUrl" + anchorHref + "\n"); anchorHref = FrameworkUtils.getLocaleReference(anchorHref, urlMap, locale, sb); log.debug("after right list LinkUrl" + anchorHref + "\n"); // End extracting valid href jsonObjrr.put("linktext", anchorText); jsonObjrr.put("linkurl", anchorHref); jsonObjrr.put("icon", "none"); jsonObjrr.put("size", ""); jsonObjrr.put("description", ""); jsonObjrr.put("openInNewWindow", "false"); list.add(jsonObjrr.toString()); } eleListNode.setProperty("listitems", list.toArray(new String[list.size()])); } log.debug("Updated title, descriptoin and linktext at " + listNode.getPath()); } catch (Exception e) { e.printStackTrace(); } }
/**
 * Walks the table rows of the page and reads title, URL, site, points and user from each row
 * that has all of them.
 *
 * <p>NOTE(review): the values read from a row are never turned into an {@link Item} or added to
 * the result, so the returned list is always empty. The method looks unfinished - confirm before
 * relying on it.
 *
 * @param doc page to scan
 * @return the (currently always empty) item list
 */
public static List<Item> extractItem(Document doc) {
  final List<Item> items = new ArrayList<Item>();
  for (Element row : doc.select("tr")) {
    final Element titleLink = row.select(".title a").first();
    if (titleLink == null) {
      continue;
    }
    String titleStr = titleLink.text().trim();
    String urlStr = titleLink.attr("href").trim();

    final Element siteNode = row.select(".comhead").first();
    if (siteNode == null) {
      continue;
    }
    String comheadStr = siteNode.text().trim();

    final Element scoreNode = row.select("span[id^=score_]").first();
    if (scoreNode == null) {
      continue;
    }
    final String scoreText = scoreNode.text();
    if (scoreText == null) {
      continue;
    }
    // Expected shape is "<number> <word>"; anything else is skipped.
    final String[] scoreParts = scoreText.split(" ");
    if (scoreParts.length != 2) {
      continue;
    }
    int points;
    try {
      points = Integer.parseInt(scoreParts[0]);
    } catch (NumberFormatException ignored) {
      points = -1;
    }
    if (points < 0) {
      continue;
    }

    final Element userLink = row.select("a[href^=user]").first();
    if (userLink == null) {
      continue;
    }
    String user = userLink.text().trim();
    Element dateElement = row.select(".subtext").first();
  }
  return items;
}
/**
 * Extracts a three-level category tree from every {@code div.txt-list-category-v2} block and
 * adds each category to {@code page}.
 *
 * <p>The block itself is the top level (name from its {@code h3}, code from its {@code id}).
 * Inside it, a link without {@code href} starts a new second-level category, and a link with
 * {@code href} is a third-level category below the most recent second-level one, with its
 * absolute URL decoded as UTF-8. Links with empty text are skipped, as are second-level links
 * whose text starts with character code 160.
 *
 * @param page holds the parsed document and receives the categories
 * @throws RuntimeException if a third-level link appears before any second-level one, or if the
 *     URL cannot be decoded
 */
@Override
public void process(ResultItems page) {
  final char skipMarker = 160;
  final Document document = (Document) page.getResource();

  for (Element block : document.select("div.txt-list-category-v2")) {
    final CategoryEntity top =
        new CategoryEntity()
            .setName(block.select("h3").text())
            .setSite(SiteName.Taobao)
            .setCode(block.attr("id"));
    getLogger().trace(top);
    page.addItem(top);

    CategoryEntity middle = null;
    for (Element anchor : block.select("a")) {
      final boolean isGroupHeader = anchor.attr("href").isEmpty();
      if (isGroupHeader) {
        final String label = anchor.text().trim();
        if (label.isEmpty() || label.charAt(0) == skipMarker) {
          continue;
        }
        middle = new CategoryEntity().setName(label).setSite(SiteName.Taobao).setParent(top);
        getLogger().trace(middle);
        page.addItem(middle);
        continue;
      }

      String target = anchor.absUrl("href");
      try {
        target = java.net.URLDecoder.decode(target, "utf-8");
      } catch (UnsupportedEncodingException e) {
        throw new RuntimeException(target, e);
      }
      final String label = anchor.text().trim();
      if (label.isEmpty()) {
        continue;
      }
      final CategoryEntity leaf =
          new CategoryEntity()
              .setName(label)
              .setUrl(target)
              .setSite(SiteName.Taobao)
              .setParent(middle);
      if (middle == null) {
        throw new RuntimeException("no parent of " + leaf);
      }
      getLogger().trace(leaf);
      page.addItem(leaf);
    }
  }
}
/** * Parse nasdq page and write in hbase * * @param symbol */ public static void parseUSSymbols(String symbol) { if (!Hbase.getData(symbol).equals("")) { // System.out.println(symbol + " Exists!"); return; } String result = HttpRequest.sendPost( "http://www.nasdaq.com/symbol/" + symbol.toLowerCase() + "/historical", length + "|false|" + symbol); if (result.equals("")) { WriteError(symbol); System.out.println(symbol + " result error"); return; } // System.out.println(result); Document doc = Jsoup.parse(result); JSONArray HistoricalData = new JSONArray(); try { Element body = doc.getElementsByTag("tbody").get(0); // System.out.println(body.toString()); Elements nodes = body.getElementsByTag("tr"); if (nodes.size() == 0) { WriteError(symbol); System.out.println(symbol + " size 0"); return; } // System.out.println(nodes.size()); for (Element node : nodes) { JSONArray DailyData = new JSONArray(); Elements units = node.getElementsByTag("td"); for (Element unit : units) { if (!unit.text().equals("")) { DailyData.put(unit.text()); } } if (DailyData.length() > 0) { HistoricalData.put(DailyData); } } Hbase.addData(symbol, type, HistoricalData.toString()); // System.out.println(symbol + " done"); } catch (Exception e) { if (handleError) { errors.add(symbol); } else { WriteError(symbol); System.out.println(symbol + " parsing error"); } // TODO: handle exception } }
/** * 这里的Integer参数对应AsyncTask中的第一个参数 这里的String返回值对应AsyncTask的第三个参数 * 该方法并不运行在UI线程当中,主要用于异步操作,所有在该方法中不能对UI当中的空间进行设置和修改 但是可以调用publish * Progress方法触发onProgressUpdate对UI进行操作 */ @Override protected String doInBackground(Integer... params) { // 测试看看能不能显示去掉后缀的部分 String str3 = new String(); String total = null; Document doc = null; try { // doc = // Jsoup.connect("http://www.cnblogs.com/zyw-205520/archive/2012/12/20/2826402.html").get(); // Elements ListDiv = doc.getElementsByAttributeValue("class","postBody"); // 武汉天气预报的借口,http://tianqi.xixik.com/city/wuhan/ doc = Jsoup.connect("http://tianqi.xixik.com/city/wuhan/").get(); Elements ListDiv = doc.getElementsByAttributeValue("class", "left"); // System.out.println(ListDiv); Element ListDiv1 = doc.getElementById("left"); // /** * 用来测试,另外一种写法 Elements ListDiv2=doc.getElementsByAttributeValue("class","content"); Element * te=ListDiv2.get(0); //System.out.println(te.text()); 还有一种写法, Elements * ListDiv3=doc.getElementsByAttributeValue("class","fourday"); for (int * i=0;i<ListDiv3.size()-1;i++){ System.out.println(ListDiv3.get(i).child(0).attr("href")); * //System.out.println(ListDiv3.get(i).text()); } */ // 我自己写的方法获得天气信息,这个只包含前三天的天气信息 Elements temp = doc.getElementsByClass("oneweather"); for (Element element : temp) { // str3=element.text()+","+str3; str3 = str3 + "," + element.text(); // System.out.println(element.text()); } // 接下来就是获取下面七天的天气预报 Elements temp1 = doc.getElementsByClass("fourday"); for (Element element : temp1) { // str3=element.text()+","+str3; str3 = str3 + "," + element.text(); // System.out.println(element.text()); } // 可以试试GBK或UTF-8 } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } // return str.toString() ; total = str3; return str3; // return test(); }
public JSONArray toFourDayJSON(String html, String[] labels) { Document doc = Jsoup.parse(html); JSONArray dates = new JSONArray(); try { Elements tables = doc.select("table"); // Log.d("jsoup", "Four day: Parsing html table: " + tables.size()); for (Element table : tables) { Elements rows = table.select("tr"); JSONObject date_item = new JSONObject(); JSONArray row_json = new JSONArray(); for (Element row : rows) { Elements data = row.select("td"); if (!data.isEmpty()) { JSONObject details = new JSONObject(); for (Element dataItem : data) { Elements img = dataItem.select("img"); String img_src = null, label = null; String[] tokens = null; if (img.size() == 0) { label = labels[data.indexOf(dataItem)]; if (label.equals("Time")) { details.put(label, dataItem.text().split("-")[0]); } else { details.put(label, dataItem.text()); } } else { img_src = img.get(0).attr("src"); tokens = img_src.split("/"); // Log.d("jsoup", "img: "+tokens[tokens.length-1]); details.put(labels[data.indexOf(dataItem)], tokens[tokens.length - 1]); } } row_json.put(details); } } date_item.put("data", new JSONArray(row_json.toString())); date_item.put("date", table.previousElementSibling().text()); dates.put(new JSONObject(date_item.toString())); } } catch (JSONException e) { e.printStackTrace(); } return dates; }
/**
 * Populates this object's declared fields from the child elements of {@code element}.
 *
 * <p>For every declared field of type {@code String}, {@code Integer}, {@code Float} or
 * {@code List<String>}, the elements whose tag name equals the field name are looked up. Scalar
 * fields take the (parsed) text of the first match; list fields take the text of every match.
 * Fields without a matching element, and fields of any other type, are left untouched. An
 * {@code IllegalAccessException} while setting a field is printed and ignored.
 *
 * @param element element whose descendants carry the values
 */
public void dealelement(Element element) {
  for (Field field : this.getClass().getDeclaredFields()) {
    final String typeName = field.getGenericType().toString();
    switch (typeName) {
      case "class java.lang.String": {
        final Element match = element.getElementsByTag(field.getName()).first();
        if (match != null) {
          assignReflectively(field, match.text());
        }
        break;
      }
      case "class java.lang.Integer": {
        final Element match = element.getElementsByTag(field.getName()).first();
        if (match != null) {
          final int parsed = Integer.parseInt(match.text());
          assignReflectively(field, parsed);
        }
        break;
      }
      case "class java.lang.Float": {
        final Element match = element.getElementsByTag(field.getName()).first();
        if (match != null) {
          final Float parsed = Float.parseFloat(match.text());
          assignReflectively(field, parsed);
        }
        break;
      }
      case "java.util.List<java.lang.String>": {
        final Elements matches = element.getElementsByTag(field.getName());
        if (matches.size() > 0) {
          final List<String> texts = new ArrayList<>();
          for (Element match : matches) {
            texts.add(match.text());
          }
          assignReflectively(field, texts);
        }
        break;
      }
      default:
        break;
    }
  }
}

/** Sets {@code field} on this object, printing (not propagating) an access failure. */
private void assignReflectively(Field field, Object value) {
  try {
    field.set(this, value);
  } catch (IllegalAccessException e) {
    e.printStackTrace();
  }
}
/**
 * Stores the text collected in {@code temp} on the current feed or item, depending on which
 * element flag is set, and clears that flag.
 *
 * <p>While {@code inChannel} is set the value goes to {@code feeds} (title, link, description,
 * language); otherwise, while {@code inItem} is set, it goes to {@code item} (title, link,
 * description, publication date, guid). Descriptions are reduced to plain text by parsing them as
 * an HTML body fragment. Closing an {@code item} element appends the item to {@code itemsList};
 * closing a {@code channel} element attaches the list to the feed, records the feed and starts a
 * new list.
 */
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
  if (inChannel) {
    if (inTitle) {
      feeds.setTitle(temp);
      inTitle = false;
    } else if (inLink) {
      feeds.setLink(temp);
      inLink = false;
    } else if (inDesc) {
      feeds.setDescription(Jsoup.parseBodyFragment(temp).body().text());
      inDesc = false;
    } else if (inLanguage) {
      feeds.setLanguage(temp);
      inLanguage = false;
    }
  } else if (inItem) {
    if (inTitle) {
      item.setTitle(temp);
      inTitle = false;
    } else if (inLink) {
      item.setLink(temp);
      inLink = false;
    } else if (inDesc) {
      item.setDescription(Jsoup.parseBodyFragment(temp).body().text());
      inDesc = false;
    } else if (inPubdate) {
      item.setPubdate(temp);
      inPubdate = false;
    } else if (inGuid) {
      item.setGuid(temp);
      inGuid = false;
    }
  }

  if (qName.equalsIgnoreCase("channel") && feeds != null) {
    feeds.setItems(itemsList);
    feedsList.add(feeds);
    itemsList = new ArrayList<RSSItem>();
  }
  if (qName.equalsIgnoreCase("item") && item != null) {
    itemsList.add(item);
  }
}
/**
 * Returns the headline of a teaser element.
 *
 * <p>For an element with class {@code m-hero__slot} the headline is the first {@code h2} inside
 * its first {@code m-hero__slot-link} child; for an element with class {@code m-entry-slot} it is
 * the first {@code h3}.
 *
 * <p>Missing pieces are detected with explicit checks instead of throwing and catching
 * {@code NullPointerException}, so no stack trace is printed for them any more.
 *
 * @param element teaser element, may be {@code null}
 * @return the headline text, or {@code "Unknown title"} when the element is {@code null}, has
 *     neither class, or lacks the expected heading
 */
String parseTitle(Element element) {
  final String unknown = "Unknown title";
  if (element == null) {
    return unknown;
  }

  Element heading = null;
  if (element.classNames().contains("m-hero__slot")) {
    Element link = element.getElementsByClass("m-hero__slot-link").first();
    if (link != null) {
      heading = link.getElementsByTag("h2").first();
    }
  } else if (element.classNames().contains("m-entry-slot")) {
    heading = element.getElementsByTag("h3").first();
  }

  return heading == null ? unknown : heading.text();
}
/** takes an element and turns the P tags into \n\n */ public String getFormattedText(Element topNode) { removeNodesWithNegativeScores(topNode); StringBuilder sb = new StringBuilder(); append(topNode, sb, nodesToKeepCssSelector); String str = SHelper.innerTrim(sb.toString()); if (str.length() > 100) return str; // no subelements if (str.isEmpty() || !topNode.text().isEmpty() && str.length() <= topNode.ownText().length()) str = topNode.text(); // if jsoup failed to parse the whole html now parse this smaller // snippet again to avoid html tags disturbing our text: return Jsoup.parse(str).text(); }
/**
 * Finds the link labelled "player statistics" (case-insensitive) in the current document.
 *
 * @return the link target prefixed with {@code http://www.whoscored.com}, or {@code null} when no
 *     such link exists or none of the candidates forms a valid URL (each failure is logged)
 */
public URL getPlayerStatsURL() {
  for (Element anchor : getDocument().getElementsByTag("a")) {
    if (!"player statistics".equalsIgnoreCase(anchor.text())) {
      continue;
    }
    final String urlString = "http://www.whoscored.com" + anchor.attr("href");
    try {
      return new URL(urlString);
    } catch (MalformedURLException e) {
      logger.error("Malformed URL exception when getting player stats URL: {}.", urlString);
    }
  }
  return null;
}