private void initPane() { // WebEngine engine = optionView.getEngine(); try { Document document = Jsoup.connect(webView.getEngine().getLocation()).get(); Element table = document.select("#normal_basket_" + document.select("[name=item_id]").val()).first(); Element td = table.select("td").first(); Elements spans = td.select("span"); Elements selects = td.select("select"); // System.out.println(spans.size()); cmb = new ArrayList<ComboBox>(); for (int i = 0; i < spans.size(); i++) { ObservableList<ValuePair> obs = FXCollections.observableArrayList(); Elements options = selects.get(i).select("option"); for (int k = 0; k < options.size(); k++) { Element option = options.get(k); obs.add(new ValuePair("choice", option.text(), option.val())); } cmb.add(new ComboBox<ValuePair>(obs)); optionArea.getChildren().addAll(new Text(spans.get(i).text()), cmb.get(i)); } } catch (Exception e) { // TODO 自動生成された catch ブロック e.printStackTrace(); } }
/**
 * Downloads the menu page for {@code number} and extracts the meals listed on it.
 *
 * <p>Each {@code td[width=400]} cell marks a meal row; within the cell's parent the first
 * {@code div#ssilka} holds the description and the last one holds the cost, of which only the
 * part before the first {@code '-'} is parsed.
 *
 * @param number value substituted into the {@code URL} format string
 * @return the parsed meals; empty when the page cannot be downloaded or contains no meal rows
 * @throws NumberFormatException if a cost does not start with an integer
 */
public List<MenuMeal> getMenuMeals(int number) {
  List<MenuMeal> meals = new ArrayList<>();
  Document doc;
  try {
    doc = Jsoup.connect(String.format(URL, number))
        .userAgent("Chrome/49.0.2623.112")
        .referrer("https://www.google.ru/")
        .timeout(7000)
        .get();
  } catch (IOException e) {
    e.printStackTrace();
    return meals;
  }

  // The attribute selector used to be written without its closing bracket.
  for (Element cell : doc.select("td[width=400]")) {
    Elements fields = cell.parent().select("div[id=ssilka]");
    if (fields.isEmpty()) {
      // Row without description/cost blocks: nothing to extract.
      continue;
    }
    String cost = fields.last().text();
    int dash = cost.indexOf('-');
    // Without a dash the whole text is taken instead of failing on substring(0, -1).
    String amount = (dash >= 0 ? cost.substring(0, dash) : cost).trim();

    MenuMeal menuMeal = new MenuMeal();
    menuMeal.setDescription(fields.first().text());
    menuMeal.setCost(Integer.valueOf(amount));
    meals.add(menuMeal);
  }
  return meals;
}
/**
 * Looks the user up through the Shotbow forum search and reports whether the matching account
 * has a title other than "Regular Member".
 *
 * <p>The search form is fetched first to obtain its {@code _xfToken}, which is then posted along
 * with the user name as keyword.
 *
 * @param user forum user name, compared case-insensitively
 * @return {@code true} if a result with that user name exists and its title is not
 *     "Regular Member"; {@code false} if no result matches
 * @throws IOException if a request fails
 */
public static boolean isShotbowDonor(String user) throws IOException {
  Document searchForm =
      Jsoup.connect("https://shotbow.net/forum/search").userAgent(WebUtils.USER_AGENT).get();
  String token = searchForm.select("input[name=_xfToken]").val();

  RequestSettings settings = new RequestSettings();
  settings.setGzip(false);
  settings.setUrl("https://shotbow.net/forum/search/search");
  settings.setHost("shotbow.net");
  settings.setOrigin("https://www.shotbow.net");
  settings.setReferer("https://shotbow.net/forum/portal/");
  settings.addParameter("keywords", user);
  settings.addParameter("users", "");
  settings.addParameter("date", "");
  settings.addParameter("_xfToken", token);

  Document results = Jsoup.parse(RequestUtils.excuteSpecialPost(settings));
  for (Element entry : results.select("li.userResult")) {
    String foundName = entry.select("a.username").first().text();
    if (!foundName.equalsIgnoreCase(user)) {
      continue;
    }
    // The first result with a matching name decides the answer.
    String title = entry.select("div.userTitle").first().text();
    return !title.equalsIgnoreCase("Regular Member");
  }
  return false;
}
/**
 * Parses the data, reading the first column by default.
 *
 * <p>Two running rowspan counters track how many more rows are still covered by the leading cell
 * and by the program cell. A program cell is read only on rows where its counter has reached
 * zero; on rows where the leading counter is also zero it is the second child, otherwise the
 * first. A row that throws is reported to {@code System.out} and leaves both counters untouched
 * except for assignments that had already happened.
 *
 * @param rows source data set
 * @return program data, one {@code [dt text, dd text]} pair per row (entries stay {@code null}
 *     for rows that carried no program cell)
 */
private static String[][] parseRows(Elements rows) {
  String[][] programs = new String[rows.size()][2];
  int leadingSpan = 0;
  int programSpan = 0;
  for (int i = 0; i < rows.size(); i++) {
    Element row = rows.get(i);
    try {
      Elements cells = row.children();
      Element programCell = null;
      if (leadingSpan == 0) {
        leadingSpan = Integer.parseInt(cells.get(0).attr("rowspan"));
        if (programSpan == 0) {
          programCell = cells.get(1);
        }
      } else if (programSpan == 0) {
        programCell = cells.get(0);
      }
      if (programCell != null) {
        programSpan = Integer.parseInt(programCell.attr("rowspan"));
        programs[i][0] = DBclass.xmlFilte(programCell.select("dt").text());
        programs[i][1] = DBclass.xmlFilte(programCell.select("dd").text());
      }
      leadingSpan--;
      programSpan--;
    } catch (Exception e) {
      e.printStackTrace(System.out);
    }
  }
  return programs;
}
public static void initMajorList(String originalUrl) { System.out.println("preparing majorList"); boolean finish = false; do { try { majorList.clear(); Connection conn = Jsoup.connect(originalUrl); Document doc = conn.timeout(10000).get(); Elements es = doc.select("#accordion__target-3 > div.course-listing__box > a"); for (Element e : es) { // major MajorForCollection major = new MajorForCollection(); major.setLevel(LEVEL); major.setTitle(e.select("h3").get(0).text().trim()); major.setType(e.select("p").get(0).text().replaceAll("-[\\s\\S]*", "").trim()); major.setUrl(e.select("a").get(0).attr("href")); majorList.add(major); } ; finish = true; } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } while (!finish); System.out.println("majorList prepared"); System.out.println("majorList size: " + majorList.size()); }
public ArrayList<String> searchAmazon( Document doc, String searchTerm, HashMap<String, String> bestBuyInfo) { String url = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=" + searchTerm; doc = jsoupConnect(url); Elements items = doc.select(".s-item-container"); ArrayList<String> matchingList = new ArrayList<String>(); for (Element ele : items) { // grab name and price String name = ele.select(".a-link-normal").attr("title"); String price = (ele.select("div div div div div div a.a-link-normal span.a-color-price") .text() .split(" ")[0]); double amazonPrice = 0.0; System.out.println(name + "\n" + price); if (price.equals("")) { price = ele.select("span.a-size-base").text().split(" ")[0]; } else if (price.contains("$")) { amazonPrice = Double.parseDouble(price.replace("$", "").replace(",", "")); } if (price.contains("$") && amazonReady(ele.select(".a-link-normal").attr("href"), true)) { matchingList.add(price); System.out.println("yes"); } else System.out.println("no"); } return matchingList; }
@Override public void parsePages(ArrayList<?> tableList, Map.Entry<String, String> entry) { int type = 2; String[] words = entry.getValue().split(";"); String key = entry.getKey().split(";")[0]; String website = "全景社区"; for (Element ele : (ArrayList<Element>) tableList) { String title = ele.select("h3.title").select("a").text(); String time = Subutils.getTime(ele.select("p.meta").last().text()); String summary = ele.select("p.content").text(); String url = ele.select("h3.title").select("a").attr("href"); String content = Page.getContent(url, "div.pcb", "utf-8"); ArrayList<Integer> FNum = new ArrayList<Integer>(); if (Transmition.contentFilter(words, content, key, FNum) && Transmition.timeFilter(time, this.spyHistory, title)) { spyHistory.add(title); Transmition.showDebug(type, title, content, url, time, summary, website, FNum.get(0)); // 调接口~~~~~ Article article = Transmition.getArticle( type, title, content, url, time, summary, website, key, FNum.get(0)); Transmition.transmit(article); } } }
/** Mudah is not standardized, result will be messy if crawl them */ @Override public List<Item> parse(String query, int size) throws IOException { // request for a page Document doc = Jsoup.connect("http://www.mudah.my/li?q=" + query) .userAgent(Constant.HTTP_USER_AGENT) .timeout(Constant.HTTP_TIMEOUT) .get(); Elements listS = doc.select("div.listing_thumbs").first().select("div.list_ads"); ArrayList<Item> result = new ArrayList<Item>(size); for (int i = 0; i < listS.size(); i++) { Element list = listS.get(i); String img = ""; list.select("div.image_thumb"); Elements imgS = list.select("div.image_thumb > a + img"); if (imgS.size() < 0) { // some may not have images img = imgS.first().attr("href"); } Element listE = list.select("li.listing_ads_title").first(); String title = listE.child(0).text(); String url = listE.child(0).attr("href"); String price = listE.text(); price = price.substring(price.lastIndexOf("RM") + 2).trim().replaceAll(" ", ""); int dPrice = Integer.parseInt(price); result.add(new Item("Mudah", title, dPrice, img, url)); } return result; }
@Override public void parsePages(ArrayList<?> tableList, Map.Entry<String, String> entry) { String website = "中国企业新闻"; int type = 4; String[] words = entry.getValue().split(";"); String key = entry.getKey().split(";")[0]; for (Element ele : (ArrayList<Element>) tableList) { String title = ele.select("li.news_title").select("a").text(); String time = FormatTime.getTime(ele.select("li.news_other").text(), "\\d{4}-\\d{2}-\\d{2}"); String summary = ele.select("li.news_content").text(); String url = ele.select("li.news_title").select("a").attr("href"); String content = Page.getAllHtmlContent(url); ArrayList<Integer> FNum = new ArrayList<Integer>(); if (Transmition.contentFilter(words, content, key, FNum) && Transmition.timeFilter(time, this.spyHistory, title)) { spyHistory.add(title); Transmition.showDebug(type, title, content, url, time, summary, website, FNum.get(0)); // 调接口~~~~~ Article article = Transmition.getArticle( type, title, content, url, time, summary, website, key, FNum.get(0)); Transmition.transmit(article); } } }
@Override public SearchResult[] getSearchResults(String searchString) throws IOException { Document doc = Jsoup.connect(searchString).timeout(CONNECTION_TIMEOUT_VALUE).get(); boolean onSearchResultsPage = doc.location().contains("adultSearch.htm"); // found the movie without a search results page if (doc.location() != null && !onSearchResultsPage) { String idOfPage = getIDStringFromDocumentLocation(doc); String posterPath = getPosterPreviewPathFromIDString(idOfPage); String label = doc.select("title").first().text(); Thumb previewImage = new Thumb(posterPath); // SearchResult directResult = new SearchResult(doc.location()); SearchResult result = null; if (posterPath != null) result = new SearchResult(doc.location(), label, previewImage); else result = new SearchResult(doc.location(), label, null); SearchResult[] directResultArray = {result}; return directResultArray; } Elements foundMovies = doc.select("table[width=690]:contains(Wish List) tr tbody:has(img)"); LinkedList<SearchResult> searchList = new LinkedList<SearchResult>(); for (Element movie : foundMovies) { String urlPath = movie.select("a").first().attr("href"); String thumb = movie.select("img").first().attr("src"); String label = movie.select("img").first().attr("alt"); SearchResult searchResult = new SearchResult(urlPath, label, new Thumb(thumb)); if (!searchList.contains(searchResult)) searchList.add(searchResult); } return searchList.toArray(new SearchResult[searchList.size()]); }
/**
 * Crawls the listing pages one after another, starting at the current {@code page} counter,
 * and stores or updates a {@code Movie} for every video item found.
 *
 * <p>Crawling stops at the first page without any {@code .video-item}. The pages are walked in
 * a loop; the former implementation called itself once per page, so its stack depth grew with
 * the number of pages.
 */
private static void crawl() {
  while (true) {
    String url = url_tpl + (page++);
    Logger.info("正在抓取:%s", url);
    if (StringUtils.isBlank(url)) {
      return;
    }
    sleep();
    Document doc = Jsoup.parse(WS.url(url).get().body, url);
    Elements elements = doc.select(".video-item");
    if (elements.isEmpty()) {
      return;
    }
    for (Element element : elements) {
      try {
        Element link = element.select(">a").first();
        String cover = link.select("img").first().absUrl("src");
        String coverTitle = link.select(".v-update").first().html();
        String detailUrl = link.absUrl("href");
        String name = element.select(".v-desc .v-title a").first().html();
        Logger.info("正在抓取名称:%s", name);

        // Reuse the stored movie with this name, or create one with a fresh id.
        Movie movie = Movie.find("byName", name).first();
        if (movie == null) {
          movie = new Movie();
          movie.id = DBCounter.generateUniqueCounter(Movie.class) + "";
        }
        movie.name = name;
        movie.cover = cover;
        movie.cover_title = coverTitle;
        movie.details =
            getDetails(
                movie,
                "http://video.baidu.com/v?word=" + URLEncoder.encode("美剧 " + name, "GBK"));
        movie.save();
      } catch (Exception e) {
        // A broken item must not stop the rest of the page.
        Logger.error(e.getMessage(), e);
      }
    }
  }
}
/**
 * Renders the vacancies into the HTML document returned by {@code getDocument()}.
 *
 * <p>The element with class {@code "vacancy template"} serves as the pattern: previously
 * rendered {@code tr.vacancy} rows are removed, then one copy of the pattern per vacancy is
 * filled in and inserted in front of the template.
 *
 * @param vacancies vacancies to render, in output order
 * @return the HTML of the updated document
 * @throws IllegalStateException if the document cannot be loaded or has no template element
 *     (the previous version failed with a {@code NullPointerException} in both cases)
 */
private String getUpdatedFileContent(List<Vacancy> vacancies) {
  Document document;
  try {
    document = getDocument();
  } catch (IOException e) {
    e.printStackTrace();
    System.out.println("Some exception occurred");
    throw new IllegalStateException("Could not load the vacancies document", e);
  }

  Element template = document.select("[class=vacancy template]").first();
  if (template == null) {
    throw new IllegalStateException("Document has no element with class \"vacancy template\"");
  }

  // Visible pattern row: same markup as the template, minus the hiding style/template class.
  Element pattern = template.clone();
  pattern.removeAttr("style");
  pattern.removeAttr("class");
  pattern.addClass("vacancy");

  document.select("tr[class=vacancy]").remove();

  for (Vacancy vacancy : vacancies) {
    Element row = pattern.clone();
    row.select("[class=city]").first().text(vacancy.getCity());
    row.select("[class=companyName]").first().text(vacancy.getCompanyName());
    row.select("[class=salary]").first().text(vacancy.getSalary());
    Element titleLink = row.select("[class=title]").select("a[href]").first();
    titleLink.text(vacancy.getTitle());
    titleLink.attr("href", vacancy.getUrl());
    // Inserted rows carry class "vacancy" only, so the template stays the insertion anchor.
    template.before(row.outerHtml());
  }
  return document.html();
}
public void getNewsInfo(String NewsUrl) { // 获得新闻来源URL try { System.out.println(NewsUrl); Document Doc = Jsoup.connect(NewsUrl).userAgent("Mozilla").cookie("auth", "token").timeout(3000).get(); Element textDIV = Doc.select("div[style=height:800px; overflow-y:scroll; width:100%;]").first(); Element TitleEle = textDIV.select("strong").first(); String Title = TitleEle.text(); // 获得文章title String PublishTime = getDate(NewsUrl); // 获得文章发表日期 Elements ContentPTags = textDIV.select("div[id=ozoom]").select("p"); String Content = "\r\n"; // 获得文章正文内容 for (Element ContentPTag : ContentPTags) { Content += ContentPTag.text() + "\r\n"; } List<String> IMGList = new ArrayList<String>(); // 获得图片地址列表 Elements IMGs = textDIV.select("td[align=center]").select("img[src]"); for (Element IMG : IMGs) { IMGList.add(IMG.attr("abs:src")); } savexml.format.source = NewsUrl; savexml.format.title = Title; savexml.format.publishtime = PublishTime; savexml.format.body = Content; savexml.format.img = IMGList; savexml.save(); } catch (Exception e) { e.printStackTrace(); } }
/**
 * Gets the news from "http://enib.net/" and appends them to {@code this.news}.
 *
 * <p>For every element with class {@code news}, the {@code h1} text becomes the name, the
 * {@code h2} text the information, and the paragraphs inside the {@code markdown} elements the
 * description (one line per paragraph).
 *
 * @return {@code this.news}; unchanged when the page cannot be loaded (the previous version
 *     dereferenced the missing document and failed with a {@code NullPointerException})
 */
public List<News> getNews() {
  Document doc;
  try {
    doc = Jsoup.connect("http://enib.net/").get();
  } catch (IOException e) {
    System.out.println("Can't load news");
    e.printStackTrace();
    return this.news;
  }

  String lineSeparator = System.getProperty("line.separator");
  for (Element entry : doc.getElementsByClass("news")) {
    String name = entry.select("h1").text();
    String information = entry.select("h2").text();
    StringBuilder description = new StringBuilder();
    for (Element paragraph : entry.getElementsByClass("markdown").select("p")) {
      description.append(paragraph.text()).append(lineSeparator);
    }
    this.news.add(new News(name, information, description.toString()));
  }
  return this.news;
}
/**
 * Downloads the page at {@code params[0]} and adds one {@code Product} per
 * {@code div.cell-border-css} container to {@code products}.
 *
 * @param params the page URL is expected at index 0
 * @return always {@code null}
 */
@Override
protected String doInBackground(String... params) {
  Document page;
  try {
    page =
        Jsoup.connect(params[0])
            .userAgent(
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36")
            .timeout(9000)
            .get();
  } catch (IOException e) {
    e.printStackTrace();
    return null;
  }

  for (Element cell : page.select("div.cell-border-css")) {
    // The fields keep the values of the product parsed last.
    name = cell.select("h4.product-title").text();
    productUrl = cell.select("a.picture").get(0).absUrl("href");
    price = cell.select("span[itemprop=price]").text();
    imgUrl = cell.select("meta[itemprop=image]").attr("content");
    Log.d("INFO", name);
    products.add(new Product(name, price, imgUrl, productUrl));
  }
  return null;
}
/**
 * Reads the XML file and converts every {@code FORM} element into a {@code DataStructure}.
 *
 * <p>Per form a nested map {@code part seqNum -> row seqNum -> graph seqNum -> graph text} is
 * built. When the whole document contains any {@code PodPart} element, parts are read from
 * {@code PodPart}, otherwise from {@code Part}. For forms "242" and "243" every part after the
 * first is merged, row by row, into the entry stored under key "1" instead of getting its own
 * entry.
 *
 * @param xmlUrl path of the XML file
 * @param whatForm form number; "242" and "243" switch on the merge behaviour
 * @return one data structure per form, in document order
 * @throws IOException if the file cannot be read
 */
public ArrayList<DataStructure> parseXML(String xmlUrl, String whatForm) throws IOException {
  System.err.println("Creating an XML database");
  Document document = Jsoup.parse(new File(xmlUrl), "UTF-8");
  ArrayList<DataStructure> xmlData = new ArrayList<>();

  // Both decisions are the same for every form, so they are taken once.
  String partTag = document.select("PodPart").isEmpty() ? "Part" : "PodPart";
  boolean mergeParts = Objects.equals(whatForm, "242") || Objects.equals(whatForm, "243");

  for (Element form : document.select("FORM")) {
    Map<String, Map<String, Map<String, String>>> hashMapGlobal = new HashMap<>();
    String kopuk = form.hasAttr("KO") ? form.attr("KO") : form.attr("KOPUK");

    for (Element part : form.select(partTag)) {
      Map<String, Map<String, String>> hashMapPart = new HashMap<>();
      for (Element row : part.select("Row")) {
        Map<String, String> graphs = new HashMap<>();
        for (Element graph : row.select("graph")) {
          graphs.put(graph.attr("seqNum"), graph.text());
        }
        hashMapPart.put(row.attr("seqNum"), graphs);
      }

      if (!mergeParts) {
        hashMapGlobal.put(part.attr("seqNum"), hashMapPart);
        continue;
      }

      System.out.println("Find 242 243 ");
      if (hashMapGlobal.isEmpty()) {
        hashMapGlobal.put(part.attr("seqNum"), hashMapPart);
        continue;
      }

      System.out.println(" hashMapGlobal.size()>0 ");
      // NOTE(review): the merge target is the entry under "1"; it is created when the first
      // part was stored under a different seqNum (this used to end in a NullPointerException)
      // — confirm that this is the wanted outcome.
      Map<String, Map<String, String>> merged = hashMapGlobal.get("1");
      if (merged == null) {
        merged = new HashMap<>();
        hashMapGlobal.put("1", merged);
      }
      for (Map.Entry<String, Map<String, String>> rowEntry : hashMapPart.entrySet()) {
        Map<String, String> target = merged.get(rowEntry.getKey());
        if (target == null) {
          // Row that only exists in the later part: previously a NullPointerException.
          target = new HashMap<>();
          merged.put(rowEntry.getKey(), target);
        }
        target.putAll(rowEntry.getValue());
      }
    }
    xmlData.add(new DataStructure(kopuk, hashMapGlobal));
  }
  return xmlData;
}
/**
 * Builds the tomes of a series, with their chapters, from the chapter list of the page.
 *
 * <p>Every {@code span.left} inside the first {@code div.detail_list} is one chapter. Its tome
 * number is the text after {@code "Vol "} in the nested {@code span.mr6} (0 when absent); its
 * chapter number is the last space-separated token of the link text. Chapters with the same
 * tome number share one {@code Tome}.
 *
 * <p>The parsed tome number used to be thrown away, which put every chapter into tome 0.
 *
 * @param htmlDocument page of the series
 * @param parent series the tomes belong to; its validity date is set to now
 * @return the tomes in order of first appearance
 * @throws NumberFormatException if a tome or chapter number is not numeric
 */
@Override
protected List<Tome> parseTomes(Document htmlDocument, Serie parent) {
  Date today = new Date();
  List<Tome> tomes = new LinkedList<>();
  Elements divChapters = htmlDocument.select("div.detail_list");
  if (!divChapters.isEmpty()) {
    for (Element span : divChapters.first().select("span.left")) {
      Element link = span.select("a").first();
      if (link == null) {
        // Without a link there is neither a chapter URL nor a chapter number.
        continue;
      }

      Element tomeLabel = span.select("span.mr6").first();
      String tomeNumberString =
          StringUtils.substringAfter(tomeLabel != null ? tomeLabel.text() : "", "Vol ");
      int tomeNumber = 0;
      if (tomeNumberString != null && !tomeNumberString.trim().isEmpty()) {
        tomeNumber = Integer.parseInt(tomeNumberString.trim());
      }

      Tome foundTome = null;
      for (Tome tome : tomes) {
        if (tomeNumber == tome.getNumber()) {
          foundTome = tome;
          break;
        }
      }
      if (foundTome == null) {
        foundTome = new Tome();
        foundTome.setNumber(tomeNumber);
        foundTome.setName("Tome " + tomeNumber);
        foundTome.setMustBeSaved(true);
        foundTome.setValidityDate(today);
        foundTome.setSerie(parent);
        tomes.add(foundTome);
      }

      Chapter chapter = new Chapter();
      chapter.setMustBeSaved(true);
      chapter.setUrl(link.attr("href"));
      String tempNumber = StringUtils.substringAfterLast(link.text(), " ");
      chapter.setNumber(Float.parseFloat(tempNumber));
      chapter.setName(span.text());
      chapter.setTome(foundTome);
      foundTome.addChapter(chapter);
    }
  }
  parent.setValidityDate(today);
  return tomes;
}
/**
 * Converts one story element into a {@code Post}.
 *
 * <p>The id comes from the element's {@code data-story-id} attribute; title, content and
 * description are the texts matched by the configured selectors. The URLs are derived from the
 * populated post via {@code findUrls}.
 *
 * @param element story element
 * @return the populated post
 * @throws NumberFormatException if the story id is missing or not a number
 */
private Post parsePost(Element element) {
  long storyId = Long.parseLong(element.dataset().get("story-id"));
  String title = element.select(titleSelector).text();
  String content = element.select(contentSelector).text();
  String description = element.select(descriptionSelector).text();

  Post result = new Post();
  result.setId(storyId);
  result.setTitle(title);
  result.setContent(content);
  result.setDescription(description);
  // Must come last: findUrls receives the post with its texts already set.
  result.setUrls(findUrls(result));
  return result;
}
private ViewModel parseDetail(Document doc, ViewModel item) { if (doc.select("select#SeasonSelection").size() > 0) { item.setType(ViewModel.Type.SERIES); String rel = doc.select("select#SeasonSelection").attr("rel"); rel = rel.substring(rel.indexOf("SeriesID=") + "SeriesID=".length()); item.setSeriesID(Integer.valueOf(rel)); // Fill seasons and episodes Elements seasons = doc.select("select#SeasonSelection > option"); List<Season> list = new ArrayList<Season>(); for (Element season : seasons) { String[] rels = season.attr("rel").split(","); Season s = new Season(); s.id = Integer.valueOf(season.val()); s.name = season.text(); s.episodes = rels; list.add(s); } item.setSeasons(list.toArray(new Season[list.size()])); } else { item.setType(ViewModel.Type.MOVIE); List<Host> hostlist = new ArrayList<Host>(); Elements hosts = doc.select("ul#HosterList").select("li"); for (Element host : hosts) { int hosterId = 0; Set<String> classes = host.classNames(); for (String c : classes) { if (c.startsWith("MirStyle")) { hosterId = Integer.valueOf(c.substring("MirStyle".length())); } } String name = host.select("div.Named").text(); String count = host.select("div.Data").text(); int c = 1; if (count.contains("/")) { count = count.substring(count.indexOf("/") + 1, count.indexOf(" ", count.indexOf("/"))); c = Integer.valueOf(count); } for (int i = 0; i < c; i++) { Host h = Host.selectById(hosterId); h.setName(name); h.setMirror(i + 1); if (h.isEnabled()) { hostlist.add(h); } } } item.setMirrors(hostlist.toArray(new Host[hostlist.size()])); } String imdb = doc.select("div.IMDBRatingLinks > a").attr("href").trim(); if (!TextUtils.isEmpty(imdb)) { imdb = imdb.replace("/", ""); item.setImdbId(imdb); } return item; }
/**
 * Extract content with jsoup, maybe later.
 *
 * <p>Walks every table row and reads title, link, source, points, user and date element from
 * the rows that have all of them.
 *
 * <p>NOTE(review): none of the extracted values is turned into an {@code Item} yet, so the
 * returned list is always empty.
 *
 * @param doc page to read
 * @return the item list (currently always empty)
 */
public static List<Item> extractItem(Document doc) {
  List<Item> itemList = new ArrayList<Item>();
  for (Element row : doc.select("tr")) {
    Element titleLink = row.select(".title a").first();
    if (titleLink == null) {
      continue;
    }
    String titleStr = titleLink.text().trim();
    String urlStr = titleLink.attr("href").trim();

    Element comHead = row.select(".comhead").first();
    if (comHead == null) {
      continue;
    }
    String comheadStr = comHead.text().trim();

    Element score = row.select("span[id^=score_]").first();
    if (score == null) {
      continue;
    }
    // Expected shape: "<number> <unit>".
    String[] scoreParts = score.text().split(" ");
    if (scoreParts.length != 2) {
      continue;
    }
    int points;
    try {
      points = Integer.parseInt(scoreParts[0]);
    } catch (NumberFormatException ignored) {
      // Not a number: treated like a negative score and skipped below.
      points = -1;
    }
    if (points < 0) {
      continue;
    }

    Element userLink = row.select("a[href^=user]").first();
    if (userLink == null) {
      continue;
    }
    String user = userLink.text().trim();
    Element dateElement = row.select(".subtext").first();
  }
  return itemList;
}
/**
 * Turns the category overview into a three-level tree of {@code CategoryEntity} items and adds
 * every entity to {@code page}.
 *
 * <p>Each {@code div.txt-list-category-v2} is a top-level category. Among its links, one
 * without an {@code href} starts a new second-level category, and one with an {@code href} is a
 * third-level category below the most recent second-level one.
 *
 * @param page crawl result whose resource is the parsed document
 * @throws RuntimeException if a third-level link appears before any second-level entry, or if
 *     the link URL cannot be decoded
 */
@Override
public void process(ResultItems page) {
  Document doc = (Document) page.getResource();
  for (Element block : doc.select("div.txt-list-category-v2")) {
    CategoryEntity ancestor =
        new CategoryEntity()
            .setName(block.select("h3").text())
            .setSite(SiteName.Taobao)
            .setCode(block.attr("id"));
    getLogger().trace(ancestor);
    page.addItem(ancestor);

    CategoryEntity parent = null;
    for (Element link : block.select("a")) {
      if (link.attr("href").isEmpty()) {
        String name = link.text().trim();
        // Skip blank names and names starting with a non-breaking space (char 160).
        if (name.isEmpty() || name.charAt(0) == 160) {
          continue;
        }
        parent = new CategoryEntity().setName(name).setSite(SiteName.Taobao).setParent(ancestor);
        getLogger().trace(parent);
        page.addItem(parent);
        continue;
      }

      String url = link.absUrl("href");
      try {
        url = java.net.URLDecoder.decode(url, "utf-8");
      } catch (UnsupportedEncodingException e) {
        throw new RuntimeException(url, e);
      }
      String name = link.text().trim();
      if (name.isEmpty()) {
        continue;
      }
      CategoryEntity grand =
          new CategoryEntity().setName(name).setUrl(url).setSite(SiteName.Taobao).setParent(parent);
      if (parent == null) {
        throw new RuntimeException("no parent of " + grand);
      }
      getLogger().trace(grand);
      page.addItem(grand);
    }
  }
}
private static PharmacieResultatParsingSousCategorie parserSousCategorie( Category subCategory, boolean premierAppel, Document sousCategorie) { PharmacieResultatParsingSousCategorie resultat = new PharmacieResultatParsingSousCategorie(); Element productList = sousCategorie.select("div.products-list").first(); if (productList != null) { Elements productsInfo = productList.select("div.info_product"); if (productsInfo != null) { for (Element productInfo : productsInfo) { Element productInfoLink = productInfo.select("a.name-link").first(); if (productInfoLink != null) { String productInfoLinkUrl = productInfoLink.attributes().get("href").toLowerCase(); resultat.getListeUrlArticlesTrouves().add(productInfoLinkUrl); } } } if (premierAppel) { resultat .getListeAutrePagesAParserMemeCategorie() .addAll(verificationAutrePageAParser(sousCategorie)); } } else { Element categoryList = sousCategorie.select("div.category-list").first(); if (categoryList != null) { Elements nouvellesSousCategorie = categoryList.select("h2.category-title"); if (nouvellesSousCategorie != null) { for (Element nouvelleSousCategorie : nouvellesSousCategorie) { Element productInfoLink = nouvelleSousCategorie.select("a").first(); String nouvelleSousCategorieLink = productInfoLink.attributes().get("href").toLowerCase(); String nouvelleSousCategorieName = nouvelleSousCategorie.text(); // On crée la nouvelle catégorie List<Category> nouvelleListeCategorie = new ArrayList<Category>(); Category newCategory = new Category(nouvelleSousCategorieName, nouvelleSousCategorieLink); newCategory.setParentCategory(subCategory); nouvelleListeCategorie.add(newCategory); if (!resultat.getMapAutresSousCategories().containsKey(subCategory)) { resultat.getMapAutresSousCategories().put(subCategory, nouvelleListeCategorie); } else { resultat .getMapAutresSousCategories() .get(subCategory) .add(new Category(nouvelleSousCategorieName, nouvelleSousCategorieLink)); } } } } } return resultat; }
/**
 * Converts every card of the downloaded document into a list item and refreshes the adapter.
 *
 * @param s result of the background step; only passed on to the superclass
 */
@Override
protected void onPostExecute(String s) {
  super.onPostExecute(s);
  Elements cards =
      doc.select("div.wrap.ch_clip._cardArea div.cds_area._infiniteCardArea div.cds._MM_CARD");
  for (Element card : cards) {
    String cardTitle = card.select("div.cds_type.uio_thumb dl.cds_info dt.title h3 span").text();
    String cardTime =
        card.select("div.cds_type.uio_thumb dl.cds_info dd.meta span.time").text();
    listCardItems.add(new Item_CardList_Ted(cardTitle, cardTime));
  }
  cardItemAdapter.notifyDataSetChanged();
}
/**
 * Reads the news entries from the listing table.
 *
 * <p>The rows are consumed in groups of three: the first row of a group is ignored, the second
 * carries the title, link and optional domain, the third the subtext (id, points, comment
 * count). Scraping stops at the first title row that has fewer than two {@code .title}
 * elements.
 *
 * @param doc listing page
 * @return the entries completed before scraping stopped
 */
public List<News> scrape(Document doc) {
  Elements rows = doc.select("body > center > table > tbody > tr > td > table > tbody > tr");
  List<News> newsList = new ArrayList<>();
  News.Builder builder = null;
  int rowIndex = 0;
  for (Element row : rows) {
    int slot = rowIndex++ % 3;
    if (slot == 1) {
      Elements titles = row.select(".title");
      if (titles.size() < 2) {
        break;
      }
      builder = new News.Builder();
      Element titleCell = titles.get(1);
      Element anchor = titleCell.select("a").first();
      builder.title = anchor.text();
      builder.url = getUrl(anchor);
      Elements comhead = titleCell.select(".comhead");
      if (comhead.size() > 0) {
        builder.domain = extract(comhead.first().text(), DOMAIN);
      }
    } else if (slot == 2) {
      assert builder != null;
      Element subtext = row.select(".subtext").first();
      Elements subLinks = subtext.select("a");
      if (subLinks.size() > 1) {
        Element comments = subLinks.get(1);
        builder.id = getId(comments);
        builder.points = getPoints(subtext);
        builder.commentsNum = getCommentsNum(comments);
      }
      newsList.add(builder.build());
    }
  }
  return newsList;
}
public JSONArray toFourDayJSON(String html, String[] labels) { Document doc = Jsoup.parse(html); JSONArray dates = new JSONArray(); try { Elements tables = doc.select("table"); // Log.d("jsoup", "Four day: Parsing html table: " + tables.size()); for (Element table : tables) { Elements rows = table.select("tr"); JSONObject date_item = new JSONObject(); JSONArray row_json = new JSONArray(); for (Element row : rows) { Elements data = row.select("td"); if (!data.isEmpty()) { JSONObject details = new JSONObject(); for (Element dataItem : data) { Elements img = dataItem.select("img"); String img_src = null, label = null; String[] tokens = null; if (img.size() == 0) { label = labels[data.indexOf(dataItem)]; if (label.equals("Time")) { details.put(label, dataItem.text().split("-")[0]); } else { details.put(label, dataItem.text()); } } else { img_src = img.get(0).attr("src"); tokens = img_src.split("/"); // Log.d("jsoup", "img: "+tokens[tokens.length-1]); details.put(labels[data.indexOf(dataItem)], tokens[tokens.length - 1]); } } row_json.put(details); } } date_item.put("data", new JSONArray(row_json.toString())); date_item.put("date", table.previousElementSibling().text()); dates.put(new JSONObject(date_item.toString())); } } catch (JSONException e) { e.printStackTrace(); } return dates; }
/** * Parse search results from a search result site * * @param pUrl */ private void parseSearchResults(String pUrl) { LOGGER.info("Started parsing: " + pUrl); Document doc = null; doc = ParserUtils.connectGetUrl(ParserUtils.getUri(pUrl).toASCIIString()); doc.setBaseUri(DEFAULT_VSP_URL); Elements results = doc.select("div[class*=map-list-item]"); for (Element result : results) { PersistentEntity ent = new PersistentEntity(); Elements infoElement = result.select("div[class*=info-content]"); LOGGER.debug(infoElement.select("p[class*=establishment-category]").first().ownText()); String tmp = result .select("div[class*=info-content]") .select("p[class*=establishment-category]") .first() .ownText(); ent.setIndustry(new Utf8(tmp.split("/")[0])); ent.setLabel(new Utf8(tmp)); // getting same as value to where it is EylloLink link = ParserUtils.detectUrl( infoElement.select("p[class*=establishment-name]").select("a").first()); if (link != null) { LOGGER.debug(DEFAULT_VSP_URL + link.getLinkHref()); ent.putToSameAs( new Utf8(DEFAULT_VSP_URL + link.getLinkHref()), new Utf8(link.getLinkText())); ent.setName(new Utf8(link.getLinkText())); } // getting its address and phone PersistentPoint point = new PersistentPoint(); infoElement = result.select("div[class*=establishment-details]").select("p"); ent.addToTelephones(new Utf8(infoElement.get(0).ownText())); point.setAddress(new Utf8(infoElement.get(0).text())); if (!result.attr("data-lng").toString().equals("") && !result.attr("data-lat").toString().equals("")) { // Format in [lon, lat], note, the order of lon/lat here in order to conform with GeoJSON. point.addToCoordinates(Double.parseDouble(result.attr("data-lng"))); point.addToCoordinates(Double.parseDouble(result.attr("data-lat"))); point.setAccuracy(EylloLocation.GEOCODER_VERIF_ACC_HIGH); } ent.setPersistentpoint(point); ent.addToScenarioId(getScenarioId()); this.pEntities.add(ent); } LOGGER.info("Completed getting basic information from entities."); }
public void test1() throws Exception { Document dom = Jsoup.connect("http://book.douban.com/latest?icn=index-latestbook-all").get(); // 根据jquery Elements es = dom.select("#content li").not(".clear"); for (int i = 0; i < es.size(); i++) { Element e = es.get(i); // li String title = e.select("h2").get(0).text(); System.out.println("title:" + title); Elements esp = e.select("p"); Element p1 = esp.get(0); String auth = p1.text(); System.out.println("auth:" + auth); String text = esp.get(1).text(); System.out.println(text); final String url = e.select("img").get(0).attr("src"); System.out.println(url); new Thread() { public void run() { try { String fileName = url.substring(url.lastIndexOf("/") + 1); HttpURLConnection con = (HttpURLConnection) new URL(url).openConnection(); con.setConnectTimeout(3000); con.setRequestMethod("GET"); con.setDoInput(true); con.connect(); int code = con.getResponseCode(); if (code == 200) { InputStream in = con.getInputStream(); byte[] b = new byte[1024]; int len = 0; OutputStream out = new FileOutputStream("f:/" + fileName); while ((len = in.read(b)) != -1) { out.write(b, 0, len); } out.close(); } con.disconnect(); } catch (Exception e) { e.printStackTrace(); } }; }.start(); System.out.println("-------------------"); } System.in.read(); }
/**
 * A descendant selector evaluated from an element matches only below that element: the chain
 * may start at the context element itself, but not at one of its ancestors.
 */
@Test
public void deeperDescendant() {
  String html =
      "<div class=head><p><span class=first>Hello</div><div class=head><p class=first><span>Another</span><p>Again</div>";
  Element firstHead = Jsoup.parse(html).getElementsByClass("head").first();

  // Only the span in the first div qualifies; the second div is outside the context element.
  Elements matches = firstHead.select("div p .first");
  assertEquals(1, matches.size());
  Element match = matches.first();
  assertEquals("Hello", match.text());
  assertEquals("span", match.tagName());

  // "body" lies above the context element, so nothing may match.
  assertEquals(0, firstHead.select("body p .first").size());
}
/**
 * Reads the ingredients from the {@code zutaten} table of a recipe page.
 *
 * <p>Every {@code tr.ingredient} row yields one ingredient from its amount and name cells. A
 * page without the table yields an empty list, a row without a name cell is skipped, and a
 * missing amount cell counts as an empty amount; all three used to end in a
 * {@code NullPointerException}.
 *
 * @param doc recipe page
 * @return the ingredients in table order, possibly empty
 */
private List<Ingredient> grabIngredients(Document doc) {
  List<Ingredient> ingredientsList = new ArrayList<Ingredient>();
  Element table = doc.select("table[class=zutaten]").first();
  if (table == null) {
    return ingredientsList;
  }
  for (Element row : table.select("tr[class=ingredient]")) {
    Element nameCell = row.select("td[class=name]").first();
    if (nameCell == null) {
      continue;
    }
    Element amountCell = row.select("td[class=nobr amount]").first();
    String amount = amountCell != null ? amountCell.text() : "";
    ingredientsList.add(new Ingredient(amount, nameCell.text()));
  }
  return ingredientsList;
}
@Override protected Boolean doInBackground(String... params) { try { Document doc = Jsoup.connect(params[0]).get(); Element body = doc.body(); Elements titleEs = body.select("td.title"); Elements subTitleEs = body.select("td.subtext"); int index = 1; if (!titleEs.isEmpty()) { if (mType == TYPE_REFRESH && mNews.size() > 0) { mNews.clear(); } Iterator<Element> iterator = titleEs.iterator(); Iterator<Element> subIt = subTitleEs.iterator(); NewEntity entity = null; User user = null; while (iterator.hasNext()) { Element e = iterator.next(); if (index % 2 == 0) { Element subE = subIt.next(); Elements aTag = e.select("a"); Elements spanTag = e.select("span.comhead"); Elements subEa = subE.select("a"); user = new User(); user.setId(subEa.get(0).text()); entity = new NewEntity( aTag.get(0).attr("href"), aTag.get(0).text(), spanTag.isEmpty() ? null : spanTag.get(0).text(), subE.html()); entity.setDiscussUrl(subEa.get(1).attr("href")); // Log.i(LOG_TAG, entity.toString()); mNews.add(entity); } index++; } } Elements more = doc.getElementsByAttributeValueStarting("href", "/x?fnid="); if (!more.isEmpty()) { mMoreURLPath = more.get(1).attr("href"); } return true; } catch (IOException e) { Log.e(LOG_TAG, "", e); return false; } }