/**
 * Downloads the detail page of a deck and fills in its class cards, neutral cards and
 * description.
 *
 * <p>Name cells that contain no link or no text are skipped instead of aborting the whole parse
 * with an unchecked index exception. If the page cannot be fetched the stack trace is printed and
 * the deck is returned in whatever state it had at that point.
 *
 * @param hsDeck deck to complete; its {@code getUrl()} is appended to {@code
 *     HPDeckSource.BASE_URL}
 * @param n passed through unchanged to {@code HtmlHelper.parseDescription}
 * @return the same {@code hsDeck} instance
 */
@Override
public HSDeck getDeckDetail(final HSDeck hsDeck, final float n) {
  try {
    final Document page = Jsoup.connect(HPDeckSource.BASE_URL + hsDeck.getUrl()).get();

    final HashMap<String, String> classHsItemMap = new HashMap<String, String>();
    final ArrayList<String> classNames = new ArrayList<String>();
    collectNameCells(
        page.select("section.class-listing table.listing td.col-name"),
        classHsItemMap,
        classNames);
    hsDeck.setClassHsItemMap(classHsItemMap);
    hsDeck.setClassHsItemList(DataBaseManager.getInstance().getAllCardsByNames(classNames));

    final HashMap<String, String> neutralHsItemMap = new HashMap<String, String>();
    final ArrayList<String> neutralNames = new ArrayList<String>();
    collectNameCells(
        page.select("section.neutral-listing table.listing td.col-name"),
        neutralHsItemMap,
        neutralNames);
    hsDeck.setNeutralHsItemMap(neutralHsItemMap);
    hsDeck.setNeutralHsItemList(DataBaseManager.getInstance().getAllCardsByNames(neutralNames));

    hsDeck.setDescription(
        HtmlHelper.parseDescription(page.select("div.deck-description").html(), n, false));
    return hsDeck;
  } catch (IOException ex) {
    ex.printStackTrace();
    return hsDeck;
  }
}

/**
 * Records, for every usable name cell, the link text as card name and the last character of the
 * cell text as its value (presumably the number of copies - confirm against the page layout).
 */
private void collectNameCells(
    final Elements cells,
    final HashMap<String, String> valueByName,
    final ArrayList<String> names) {
  for (int i = 0; i < cells.size(); ++i) {
    final Elements anchors = cells.get(i).select("a");
    final String cellText = cells.get(i).text().trim();
    if (anchors.isEmpty() || cellText.isEmpty()) {
      // Nothing to read from this cell; get(0) / substring(-1) would throw here.
      continue;
    }
    final String name = anchors.get(0).text();
    valueByName.put(name, cellText.substring(cellText.length() - 1));
    names.add(name);
  }
}
public Date getStartDate(StockInfo stock) throws Exception { String path = "/corp/go.php/vMS_MarketHistory/stockid/" + stock.numberToString() + ".phtml"; URI uri = new URIBuilder() .setScheme("http") .setHost("vip.stock.finance.sina.com.cn") .setPath(path) .setParameter("year", "1980") .setParameter("jidu", "1") .build(); DownloadHelper download = new DownloadHelper(uri); InputStream is = download.getInputStream(); Document doc = Jsoup.parse(inputStreamToStringBuilder(is).toString()); is.close(); download.close(); Elements select = doc.getElementsByAttributeValue("name", "year"); if (select == null) { return null; } // System.out.println(select.size()); Elements years = select.get(0).getElementsByTag("option"); String year = years.get(years.size() - 1).text(); // System.out.println(year); return Date.quarterToDate(Integer.parseInt(year), tryQuarter(stock, year)); }
/**
 * Parses the data; by default the first column is parsed.
 *
 * <p>Returns one {@code {dt text, dd text}} pair per input row. Rows from which no program cell
 * is read keep {@code null} in both slots.
 *
 * @param rows source data set
 * @return program data, {@code rows.size()} entries of two strings each
 */
private static String[][] parseRows(Elements rows) {
  String[][] programs = new String[rows.size()][2];
  // Counters reloaded from a cell's "rowspan" attribute and decremented once per row.
  // NOTE(review): presumably the number of rows still covered by the spanning cell of the
  // first / second column - confirm against the table layout.
  int rowspan_0 = 0;
  int rowspan_1 = 0;
  for (int i = 0; i < rows.size(); i++) {
    Element row = rows.get(i);
    try {
      Elements cells = row.children();
      if (rowspan_0 == 0) {
        // First counter exhausted: child 0 supplies the new rowspan_0.
        Element cell_0 = cells.get(0);
        rowspan_0 = Integer.valueOf(cell_0.attr("rowspan"));
        if (rowspan_1 == 0) {
          // Second counter exhausted as well: the program is read from child 1.
          Element cell_1 = cells.get(1);
          rowspan_1 = Integer.valueOf(cell_1.attr("rowspan"));
          programs[i][0] = DBclass.xmlFilte(cell_1.select("dt").text());
          programs[i][1] = DBclass.xmlFilte(cell_1.select("dd").text());
        }
      } else if (rowspan_1 == 0) {
        // Only the second counter is exhausted: here child 0 is the program cell.
        Element cell_0 = cells.get(0);
        rowspan_1 = Integer.valueOf(cell_0.attr("rowspan"));
        programs[i][0] = DBclass.xmlFilte(cell_0.select("dt").text());
        programs[i][1] = DBclass.xmlFilte(cell_0.select("dd").text());
      }
      rowspan_0--;
      rowspan_1--;
    } catch (Exception e) {
      // Any failure (e.g. a missing cell or a non-numeric rowspan) is printed to stdout and the
      // row is skipped; note that the two decrements above are skipped for that row too.
      e.printStackTrace(System.out);
    }
  }
  return programs;
}
private void initPane() { // WebEngine engine = optionView.getEngine(); try { Document document = Jsoup.connect(webView.getEngine().getLocation()).get(); Element table = document.select("#normal_basket_" + document.select("[name=item_id]").val()).first(); Element td = table.select("td").first(); Elements spans = td.select("span"); Elements selects = td.select("select"); // System.out.println(spans.size()); cmb = new ArrayList<ComboBox>(); for (int i = 0; i < spans.size(); i++) { ObservableList<ValuePair> obs = FXCollections.observableArrayList(); Elements options = selects.get(i).select("option"); for (int k = 0; k < options.size(); k++) { Element option = options.get(k); obs.add(new ValuePair("choice", option.text(), option.val())); } cmb.add(new ComboBox<ValuePair>(obs)); optionArea.getChildren().addAll(new Text(spans.get(i).text()), cmb.get(i)); } } catch (Exception e) { // TODO 自動生成された catch ブロック e.printStackTrace(); } }
public Chapter createChapter(int id, String page) { Chapter chapter = new Chapter(id); chapter.setUrl(Constants.BASE_URL + getVersion() + page); String cache = getCachePath() + page; try { String html = client.requestWithCache(chapter.getUrl(), cache, client.METHOD_GET, null); Document chapterDoc = Jsoup.parse(html); // 取出内容 Elements tables = chapterDoc.select("table"); int tableIndexOfMainBody = 1; if (tables.size() == 1) { tableIndexOfMainBody = 0; } Element table = chapterDoc.select("table").get(tableIndexOfMainBody); Elements sectionElements = table.select("td[class=v]"); logger.debug(sectionElements.size()); for (Element tdIndex : sectionElements) { Element tdContent = tdIndex.nextElementSibling(); String section = tdContent.text(); logger.debug(section); chapter.addSection(section); } } catch (IOException e) { logger.error(e.getMessage()); } catch (IndexOutOfBoundsException e) { logger.error(e.getMessage()); } return chapter; }
/** Mudah is not standardized, result will be messy if crawl them */ @Override public List<Item> parse(String query, int size) throws IOException { // request for a page Document doc = Jsoup.connect("http://www.mudah.my/li?q=" + query) .userAgent(Constant.HTTP_USER_AGENT) .timeout(Constant.HTTP_TIMEOUT) .get(); Elements listS = doc.select("div.listing_thumbs").first().select("div.list_ads"); ArrayList<Item> result = new ArrayList<Item>(size); for (int i = 0; i < listS.size(); i++) { Element list = listS.get(i); String img = ""; list.select("div.image_thumb"); Elements imgS = list.select("div.image_thumb > a + img"); if (imgS.size() < 0) { // some may not have images img = imgS.first().attr("href"); } Element listE = list.select("li.listing_ads_title").first(); String title = listE.child(0).text(); String url = listE.child(0).attr("href"); String price = listE.text(); price = price.substring(price.lastIndexOf("RM") + 2).trim().replaceAll(" ", ""); int dPrice = Integer.parseInt(price); result.add(new Item("Mudah", title, dPrice, img, url)); } return result; }
/**
 * Extracts the areas listed in the first element with class {@code subarea}.
 *
 * <p>Children are scanned from the second one onwards: a {@code b} element sets the current
 * letter, an {@code a} element becomes an area carrying that letter, the running {@code index}
 * and the pinyin taken from its href (characters 7 up to the last slash).
 *
 * @param text HTML to parse
 * @param pid parent id stored on every area
 * @return the areas found, empty when there is no {@code subarea} element
 */
public List<AreaVO> parseMessage(String text, int pid) {
  List<AreaVO> areas = new ArrayList<AreaVO>();
  Elements subareas = Jsoup.parse(text).body().getElementsByClass("subarea");
  if (subareas.size() == 0) {
    return areas;
  }

  Elements entries = subareas.get(0).children();
  String currentLetter = "";
  for (int pos = 1; pos < entries.size(); pos++) {
    Element entry = entries.get(pos);
    String tag = entry.tagName();
    if ("b".equals(tag)) {
      currentLetter = entry.text();
    } else if ("a".equals(tag)) {
      AreaVO area = new AreaVO();
      area.setLetter(currentLetter);
      area.setName(entry.text());
      area.setOrderIdx(index);
      area.setPid(pid);
      String link = entry.attr("href");
      area.setPinyin(link.substring(7, link.lastIndexOf("/")));
      index++;
      System.out.println(area.toString());
      areas.add(area);
    }
  }
  return areas;
}
public static void main(String[] args) throws IOException { // Validate.isTrue(args.length == 1, "usage: supply url to fetch"); // String url = args[0]; // String url = "http://www.hao123.com"; String url = "http://www.iteye.com/login"; print("Fetching %s...", url); Document doc = Jsoup.connect(url).get(); Elements links = doc.select("a[href]"); Elements media = doc.select("[src]"); Elements imports = doc.select("link[href]"); print("\nMedia: (%d)", media.size()); for (Element src : media) { if (src.tagName().equals("img")) print( " * %s: <%s> %sx%s (%s)", src.tagName(), src.attr("abs:src"), src.attr("width"), src.attr("height"), trim(src.attr("alt"), 20)); else print(" * %s: <%s>", src.tagName(), src.attr("abs:src")); } print("\nImports: (%d)", imports.size()); for (Element link : imports) { print(" * %s <%s> (%s)", link.tagName(), link.attr("abs:href"), link.attr("rel")); } print("\nLinks: (%d)", links.size()); for (Element link : links) { print(" * a: <%s> (%s)", link.attr("abs:href"), trim(link.text(), 35)); } }
/**
 * Converts one result row into a bilingual entry and adds it to the result builder.
 *
 * <p>The row must be a {@code tr} element holding exactly two elements with class {@code words}
 * (source first, target second); anything else is logged as a warning and ignored.
 */
private void processEntry(
    @NotNull String queryString,
    @NotNull Element entryNode,
    @NotNull BilingualQueryResultBuilder resultBuilder,
    @NotNull Language sourceLanguage,
    @NotNull Language targetLanguage) {
  String tagName = entryNode.tag().getName();
  if (!StringUtils.equals(tagName, "tr")) {
    LOGGER.warn("Expected <tr> tag - got <{}>", entryNode.tag().getName());
    return;
  }

  Elements wordCells = entryNode.getElementsByClass("words");
  int cellCount = wordCells.size();
  if (cellCount != 2) {
    LOGGER.warn("Expected 2 elements with class \"words\" - got {}", wordCells.size());
    return;
  }

  Element sourceCell = wordCells.get(0);
  Element targetCell = wordCells.get(1);

  BilingualEntryBuilder entry = ImmutableBilingualEntry.builder();
  entry.setEntryType(detectEntryType(sourceCell));
  entry.setInputObject(processSingleNode(sourceCell, sourceLanguage, queryString));
  entry.setOutputObject(processSingleNode(targetCell, targetLanguage, queryString));
  resultBuilder.addBilingualEntry(entry.build());
}
private Integer searchResults(Document document) { Integer occurences = 0; String searchResult = "0"; Elements searchResults = document.select("h2.page-title.hidden-xs"); if (searchResults.size() == 0) { // Sometimes results come in a different place, check it searchResults = document.select("div#resultsCountHeader h1.fnt12"); } if (searchResults.size() > 0) { searchResult = searchResults.get(0).text().split(" ")[0]; } // When the result is more than 1000 we get 1000+, so we delete the + sign if (searchResult.endsWith("+")) { searchResult = searchResult.substring(0, searchResult.length() - 1); } try { // We deal with results like 'Zero' or 'Sorry, none job...' occurences = Integer.parseInt(searchResult.replace(",", "")); } catch (NumberFormatException e) { System.out.println("Error parsing:" + searchResult); occurences = 0; } return occurences; }
/**
 * Rewrites the images of an article and inserts its cover image.
 *
 * <p>Every image with a {@code data-src} containing a slash gets {@code src} set to that value
 * with the part after the last slash replaced by {@code 640}, loses its {@code data-s}, {@code
 * data-src} and {@code data-w} attributes and gets {@code max-width="640"}. Images without such a
 * {@code data-src} are left untouched; previously their {@code src} was overwritten with the bare
 * string {@code 640}. For every script that declares {@code var cover = "..."} an image with that
 * URL is inserted before the first {@code div}.
 *
 * @param pcont HTML content, may be {@code null}
 * @return the rewritten document HTML, or an empty string for {@code null} input
 */
public String reviseImgForWX(String pcont) {
  if (pcont == null) return "";
  Document doc = Jsoup.parse(pcont);

  for (Element img : doc.select("img")) {
    String source = img.attr("data-src");
    int pos = source.lastIndexOf("/") + 1;
    if (pos == 0) {
      // Missing data-src or no path separator: there is no base URL to build on.
      continue;
    }
    source = source.substring(0, pos);
    img.removeAttr("data-s");
    img.removeAttr("data-src");
    img.removeAttr("data-w");
    img.attr("src", source + "640");
    img.attr("max-width", "640");
  }

  Elements elesrp = doc.select("script");
  Elements divs = doc.select("div");
  if (elesrp.size() > 0 && divs.size() > 0) {
    // Compiled once instead of once per script element.
    Pattern p = Pattern.compile("(?<=(var\\scover\\s=\\s\"))\\S+(?=\")");
    for (Element ele : elesrp) {
      Matcher m = p.matcher(ele.html());
      if (m.find()) {
        String nimg = "<img src=\"" + m.group() + "\"/>";
        divs.get(0).before(nimg);
      }
    }
  }
  return doc.html();
}
public Holder doParse(String html, String url) { Holder holder = new Holder(); holder.url = url; Document doc = Jsoup.parse(html, url); Elements typeElement = doc.select("body > div.main_w.clearfix > div.main.clearfix > ul > li:nth-child(5) > a"); holder.dishType = typeElement.text(); Elements titleElement = doc.select( "body > div.main_w.clearfix > div.main.clearfix > div.cp_header.clearfix > div.cp_main_info_w > div.info1 > h1 > a"); holder.title = titleElement.text(); Elements methodElement = doc.select( "body > div.main_w.clearfix > div.main.clearfix > div.cp_header.clearfix > div.cp_main_info_w > div.info2 > ul > li:nth-child(1) > a"); holder.method = methodElement.text(); Elements materialElement = doc.select( "body > div.main_w.clearfix > div.main.clearfix > div.cp_body.clearfix > div.cp_body_left > div.materials > div > div.yl.zl.clearfix > ul > li > div > h4 > a"); holder.mainMaterial = materialElement.text(); Elements stepE = doc.select( "body > div.main_w.clearfix > div.main.clearfix > div.cp_body.clearfix > div.cp_body_left > div.measure > div.editnew.edit > div.content.clearfix"); // // body > div.main_w.clearfix > div.main.clearfix > div.cp_body.clearfix // > div.cp_body_left > div.measure > div.editnew.edit > // div.content.clearfix // body > div.main_w.clearfix > div.main.clearfix > div.cp_body.clearfix // > div.cp_body_left > div.measure > div.edit > p:nth-child(1) > em // if (stepE.size() == 0) { stepE = doc.select( "body > div.main_w.clearfix > div.main.clearfix > div.cp_body.clearfix > div.cp_body_left > div.measure > div.edit > p"); } for (int i = 0; i < stepE.size(); i++) { Element e = stepE.get(i); if (e.children().hasClass("step")) { String step = e.text(); if (!"".equals(step)) { holder.steps.add(step); } } } // body > div.main_w.clearfix > div.main.clearfix > div.cp_body.clearfix // > div.cp_body_left > div.measure > div.editnew.edit > // div:nth-child(1) return holder; }
private void saveSmsLog( SimpleObject context, final int page, final int t, final Date d, final String dstr, final int isHistory) { String text = ContextUtil.getContent(context); Document doc = ContextUtil.getDocumentOfContent(context); System.out.println(doc.toString()); if (text.indexOf("没有查找到相关数据") >= 0) { return; } String tableSort = InfoUtil.getInstance().getInfo("dx/sh", "tableSort"); String tbody = InfoUtil.getInstance().getInfo("dx/sh", "tbody"); String tr = InfoUtil.getInstance().getInfo("dx/sh", "tr"); String td = InfoUtil.getInstance().getInfo("dx/sh", "td"); Elements elements = doc.select(tableSort); if (elements != null && elements.size() > 0) { Elements elements2 = elements.first().select(tbody).first().select(tr); for (int j = 0; j < elements2.size(); j++) { try { Elements tds = elements2.get(j).select(td); if (tds.size() == 5) { String RecevierPhone = tds.get(2).text().trim(); // 对方号码 String SentTime = tds.get(1).text().trim(); // 发送时间 String BusinessType = tds.get(3).text().trim(); // 费用类型 String AllPay = tds.get(4).text().trim(); // 费用 Date sentTime = null; try { sentTime = DateUtils.StringToDate(SentTime, "yyyy-MM-dd HH:mm:ss"); } catch (Exception e) { e.printStackTrace(); } TelcomMessage obj = new TelcomMessage(); obj.setPhone(phoneNo); UUID uuid = UUID.randomUUID(); obj.setId(uuid.toString()); obj.setBusinessType(BusinessType); // 业务类型:点对点 obj.setRecevierPhone(RecevierPhone); // 对方号码 obj.setSentTime(sentTime); // 发送时间 obj.setCreateTs(new Date()); obj.setAllPay(Double.parseDouble(AllPay)); // 总费用 messageList.add(obj); } } catch (Exception e) { logger.error("saveSmsLog", e); } } if (text.contains("下一页")) { requestSmsLogService(page + 1, 1, d, dstr, isHistory); } } }
/**
 * Parses the feed markup, appends one {@code FeedItem} per {@code div.tie-box} to {@code
 * mFeedItems} and notifies the adapter.
 *
 * <p>Any parsing failure is printed and stops the parse; items added before the failure are
 * kept. The adapter is notified in every case.
 *
 * @param resource HTML of the feed page
 */
private void parseFeedItem(String resource) {
  try {
    Document doc = Jsoup.parse(resource);
    Element masthead = doc.select("div.tie-wrapper").first();
    Elements feedBoxs = masthead.select("div.tie-box");
    for (int i = 0; i < feedBoxs.size(); i++) {
      FeedItem feedItem = new FeedItem();
      Element feedPost = feedBoxs.get(i);
      Element titleElement = feedPost.select("div.tie-header h2.tie-title a").first();
      Element nameElement =
          feedPost.select("div.tie-content div.tie-user div.user-info p span.user-name").first();
      Element sourceElement =
          feedPost.select("div.tie-content div.tie-user div.user-info p span.user-form").first();
      Element timestampElement =
          feedPost.select("div.tie-content div.tie-user div.user-info p.tie-date").first();
      Elements imageElement = feedPost.select("div.tie-content img.st-photo");
      Elements contentElements = feedPost.select("div.tie-content p:not(.tie-date):gt(0)");

      String title = titleElement.text();
      String name = nameElement.text();
      String source = sourceElement.text();
      String timestamp = timestampElement.text();

      // One paragraph per line, each terminated by a newline.
      StringBuilder content = new StringBuilder();
      for (int j = 0; j < contentElements.size(); j++) {
        content.append(contentElements.get(j).text()).append("\n");
      }

      // Compare by content: reference comparison with "" is not a reliable emptiness test.
      String imageSrc = imageElement.attr("src");
      String image = imageSrc.isEmpty() ? null : url + imageSrc;

      feedItem.setTitle(title);
      feedItem.setName(name);
      feedItem.setPostTime(timestamp);
      feedItem.setSource(source);
      feedItem.setImage(image);
      feedItem.setContent(content.toString());
      mFeedItems.add(feedItem);
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
  mFeedItemAdapter.notifyDataSetChanged();
}
/** * Constructs a component hierarchy from the design specified as an html tree. * * <p>If a component root is given, the component instances created during reading the design are * assigned to its member fields based on their id, local id, and caption * * @param doc the html tree * @param componentRoot optional component root instance. The type must match the type of the root * element in the design. * @param classWithFields a class (componentRoot class or a super class) with some member fields. * The member fields whose type is assignable from {@link Component} are bound to fields in * the design based on id/local id/caption */ private static DesignContext designToComponentTree( Document doc, Component componentRoot, Class<?> classWithFields) { DesignContext designContext = new DesignContext(doc); designContext.readPackageMappings(doc); // No special handling for a document without a body element - should be // taken care of by jsoup. Element root = doc.body(); Elements children = root.children(); if (children.size() > 1) { throw new DesignException( "The first level of a component hierarchy should contain at most one root component, but found " + children.size() + "."); } Element element = children.size() == 0 ? 
null : children.first(); if (componentRoot != null) { if (element == null) { throw new DesignException( "The root element cannot be null when the specified root Component is" + " not null."); } // user has specified root instance that may have member fields that // should be bound final FieldBinder binder; try { binder = new FieldBinder(componentRoot, classWithFields); } catch (IntrospectionException e) { throw new DesignException("Could not bind fields of the root component", e); } // create listener for component creations that binds the created // components to the componentRoot instance fields ComponentCreationListener creationListener = new ComponentCreationListener() { @Override public void componentCreated(ComponentCreatedEvent event) { binder.bindField(event.getComponent(), event.getLocalId()); } }; designContext.addComponentCreationListener(creationListener); // create subtree designContext.readDesign(element, componentRoot); // make sure that all the member fields are bound Collection<String> unboundFields = binder.getUnboundFields(); if (!unboundFields.isEmpty()) { throw new DesignException("Found unbound fields from component root " + unboundFields); } // no need to listen anymore designContext.removeComponentCreationListener(creationListener); } else { // createChild creates the entire component hierarchy componentRoot = element == null ? null : designContext.readDesign(element); } designContext.setRootComponent(componentRoot); return designContext; }
/**
 * Extracts the movies listed on a search result page.
 *
 * <p>Each table inside the first element with class {@code article} is read as one movie: the
 * Douban URL, numeric id and image URL come from the first cell, the name and the intro markup
 * from the second cell.
 *
 * @param doc parsed search result page
 * @return the movies found, or {@code null} when the page has no {@code article} element or that
 *     element contains no table
 */
public static List<Movie> getSearchResult(Document doc) {
  Elements elem = doc.getElementsByAttributeValue("class", "article");
  if (elem == null || elem.isEmpty()) {
    // Same outcome as "no result tables" instead of an index exception.
    return null;
  }
  Elements movies = elem.get(0).getElementsByTag("table");
  if (movies == null || movies.size() == 0) {
    return null;
  }

  ArrayList<Movie> results = new ArrayList<Movie>();
  for (int i = 0; i < movies.size(); i++) {
    Movie m = new Movie();
    Elements tds = movies.get(i).getElementsByTag("td");

    // The first cell's markup carries both the subject link and the poster image.
    String imgURL = tds.get(0).html();
    String doubanURL = imgURL.substring(imgURL.indexOf("http://movie.douban.com/subject/"));
    doubanURL = doubanURL.substring(0, doubanURL.indexOf("\""));
    m.setDoubanUrl(doubanURL);
    Log.d("doubanURL:", doubanURL);

    String mID = doubanURL.replace("http://movie.douban.com/subject/", "");
    mID = mID.replace("/", "");
    m.setID(Integer.parseInt(mID));
    Log.d("movie id", mID);

    imgURL = imgURL.substring(imgURL.indexOf("<img src=\""));
    imgURL = imgURL.replace("<img src=\"", "");
    imgURL = imgURL.substring(0, imgURL.indexOf("\""));
    m.setImgUrl(imgURL);
    Log.d("imgURL", imgURL);

    Elements title = tds.get(1).getElementsByAttributeValue("class", "pl2");
    List<Node> nodes = title.get(0).childNodes();
    List<Node> n = nodes.get(0).childNodes();
    String name = n.get(0).toString().replace("/", "");
    Log.d("movie name", name);
    m.setMovieName(name);

    Elements info = tds.get(1).getElementsByAttributeValue("class", "pl");
    String intro = info.get(0).childNodes().get(0).toString();
    m.setDirActHtml(intro);

    results.add(m);
  }
  return results;
}
/**
 * Scrapes news entries from the rows of the nested listing table.
 *
 * <p>Rows are handled in groups of three by position: the row at offset 1 starts a new entry
 * (title, url, optional domain), the row at offset 2 completes it (id, points, comment count)
 * and adds it to the result; the row at offset 0 is ignored.
 *
 * @param doc parsed page
 * @return the entries built before the rows ran out or a title row with fewer than two {@code
 *     .title} elements was met
 */
public List<News> scrape(Document doc) {
  Elements trs =
      doc.select("body > center > " + "table > tbody > tr > td > " + "table > tbody > tr");
  // Running row counter; only its value modulo 3 is used.
  int num = 0;
  List<News> newsList = new ArrayList<>();
  // Entry under construction; set on a title row, consumed on the following row.
  News.Builder builder = null;
  out:
  for (Element tr : trs) {
    switch (num % 3) {
      case 1:
        Elements titles = tr.select(".title");
        if (titles.size() < 2) {
          // Not a regular title row: stop scraping altogether (leaves the loop, not just the
          // switch).
          break out;
        }
        builder = new News.Builder();
        // The second ".title" element holds the link.
        Element titleEl = titles.get(1);
        Element a = titleEl.select("a").first();
        builder.title = a.text();
        builder.url = getUrl(a);
        Elements comhead = titleEl.select(".comhead");
        if (comhead.size() > 0) {
          String domain = comhead.first().text();
          builder.domain = extract(domain, DOMAIN);
        }
        break;
      case 2:
        assert builder != null;
        Element subtext = tr.select(".subtext").first();
        Elements els = subtext.select("a");
        if (els.size() > 1) {
          // The second link of the subtext row is used as the comments link.
          Element comments = els.get(1);
          builder.id = getId(comments);
          builder.points = getPoints(subtext);
          builder.commentsNum = getCommentsNum(comments);
        }
        // The entry is added even when the subtext row had fewer than two links.
        newsList.add(builder.build());
        break;
    }
    num++;
  }
  return newsList;
}
/**
 * Strips the inline style from the image list container and from its first direct image.
 *
 * @param pcont HTML content, may be {@code null}
 * @return the document HTML after the change, or an empty string for {@code null} input
 */
public String reviseImgForBundpic(String pcont) {
  if (pcont == null) {
    return "";
  }

  Document doc = Jsoup.parse(pcont);
  Elements images = doc.select("div#list_image_div>img");
  Elements containers = doc.select("div#list_image_div");

  // Both must be present; otherwise the markup is returned as parsed.
  boolean incomplete = containers.size() == 0 || images.size() == 0;
  if (!incomplete) {
    images.get(0).removeAttr("style");
    containers.get(0).removeAttr("style");
  }
  return doc.html();
}
private void parseLoginStep4(SimpleObject context) { Document doc = ContextUtil.getDocumentOfContent(context); Elements e1 = doc.select("form#c2000004"); if (e1.size() > 0) { data.put("errMsg", e1.select("td#status2").text()); setStatus(STAT_STOPPED_FAIL); notifyStatus(); return; } e1 = doc.select("form#login_form"); if (e1.size() > 0) { data.put("errMsg", "登录失败,请重试!"); setStatus(STAT_STOPPED_FAIL); notifyStatus(); return; } String text = ContextUtil.getContent(context); String url = StringUtil.subStr( "<script type='text/javascript'>location.replace('", "');</script>", text); if (StringUtils.isBlank(url.trim())) { if ("IBM HTTP Server".equalsIgnoreCase(doc.select("title").text())) { setStatus(STAT_LOGIN_SUC); // notifyStatus(); ssoLogin(context); } else { data.put("fail", true); setStatus(STAT_STOPPED_FAIL); notifyStatus(); logger.error("Login Fail....."); } return; } getUrl( url, null, new Object[] {UAM_CHAR_SET}, new AbstractProcessorObserver(util, WaringConstaint.ZGDX_5) { @Override public void afterRequest(SimpleObject context) { setStatus(STAT_LOGIN_SUC); ssoLogin(context); } }); }
public static boolean getFormFields( ResponseWrapper rw, List<NameValuePairString> hiddenFormFields, String formSelector) { // --- analisi della pagina contente la form, specifica al sito Document doc = rw.getJSoupDocument(); Elements els = doc.select(formSelector); // per debug, dovrebbe essere uo if (els == null || els.size() <= 0) { log.error("unable to find form at selector: " + formSelector); System.exit(1); return false; } Element loginForm = els.get(0); if (loginForm == null) { log.error("failed to get form to analyze at: " + rw.dump()); System.exit(1); } // log.info("login form OUTER HTML\n" + loginForm.outerHtml()); Elements inputFields = loginForm.select("input"); // display all for (Element e : inputFields) { String type = e.attr("type"); if (type.equals("submit")) { continue; } String attrName = e.attr("name"); hiddenFormFields.add(new NameValuePairString(attrName, e.val())); log.debug("captured form input: " + attrName + " = " + e.val()); } return false; }
/**
 * Walks the pages of the book listing, starting at page 1, and saves every product found until a
 * page without products is reached.
 *
 * <p>If a page cannot be fetched the run stops and is left to the next scheduled execution;
 * previously the swallowed exception was followed by a {@code NullPointerException}. Saving is
 * best effort: a book that cannot be saved is skipped.
 */
@Scheduled(fixedDelay = 900000)
public void loadBooksInfo() {
  int booksListPageNumber = 1;
  while (true) {
    String pageUrl =
        "http://www.labirint.ru/genres/2308/?page=" + Integer.toString(booksListPageNumber++);
    Document document;
    try {
      document = Jsoup.parse(new URL(pageUrl), 5000);
    } catch (IOException ex) {
      // Covers MalformedURLException as well. Without a document there is nothing to scan.
      System.err.println("Could not load " + pageUrl + ": " + ex);
      return;
    }

    Elements elements = document.getElementsByClass("product");
    if (elements.size() == 0) {
      break;
    }

    for (Element product : elements) {
      try {
        bookDao.saveBook(bookInfo(product));
      } catch (Exception ex) {
        // A constraint violation is skipped silently, as before; anything else is reported
        // but, as before, does not stop the run.
        if (!(ex instanceof ConstraintViolationException)) {
          System.err.println("Could not save book from " + pageUrl + ": " + ex);
        }
      }
    }
  }
}
public static void readHead() { String url = "http://www.2177s.com"; try { Document doc = Jsoup.connect(url).timeout(10000).get(); String title = doc.title(); System.out.printf("title:%s\n", title); // Elements eles = doc.select("meta[name~=(?i)keywords|(?i)description]"); Elements eles = doc.select("meta"); System.out.println(eles.size()); for (Element ele : eles) { if (StringUtils.containsIgnoreCase(url, title)) ; if (ele.toString().matches(".*(?i)keywords.*")) { System.out.println(ele.attr("content")); } // System.out.println(ele.attr("content")); } // Elements eles = doc.getElementsByTag("meta"); // for (Element ele : eles) { // System.out.printf("keys:%s\n", ele.attr("keywords")); // System.out.printf("desc:%s\n", ele.attr("description")); // System.out.println("----------------"); // } doc = null; } catch (Exception e) { e.printStackTrace(); } }
/** * begin crawling with a specific url use depth first search * * @throws IOException * @throws SQLException */ public void crawl(String starturl) throws IOException, SQLException { if (urlid >= MAXURL) // base case return; Document doc; try { doc = Jsoup.connect(starturl).get(); } catch (IOException e) { // if the url is not valid, stop the crawling process return; } catch (IllegalArgumentException e) { System.out.println("Must supply a valid URL : " + starturl); return; } if (!urlList.contains(starturl)) { urlList.add(starturl); } // if the url has already been crawled else if (urlList.contains(starturl)) { return; } Elements hrefs = doc.select("a"); urlid += 1; // terminate the process if there is no more link in a webpage if (hrefs == null || hrefs.size() == 0) return; HashMap<String, Integer> wordMap = parseHTML(getHTMLContent(starturl)); insertDBWord(starturl, wordMap, urlid); insertDBDescription(starturl, topOneHundred(starturl), urlid); for (Element e : hrefs) { String href = e.attr("href"); crawl(href); // depth first search; } }
@Override public List<String> parseCategory(String categoryName, String categoryURL) { // TODO Auto-generated method stub List<String> linksByCategoryList = null; try { Document doc = Jsoup.connect(categoryURL).timeout(Constants.MAX_DELAY_TIME * 1000).get(); Elements links = doc.select("div[class=views-field views-field-title]").select("a"); if (links != null && links.size() > 0) { linksByCategoryList = new ArrayList<String>(); for (Element element : links) { String newsLink = element.attr("href"); newsLink = newsLink.substring(1); linksByCategoryList.add(newsLink); } } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return linksByCategoryList; }
/**
 * Applies the configured CSS selector to the given HTML.
 *
 * @param inputContent HTML to search
 * @return the trimmed inner HTML of the matching elements, or {@code null} when nothing matches
 * @throws Exception if validation fails
 */
@Override
public String fire(String inputContent) throws Exception {
  validate();
  Elements matches = Jsoup.parse(inputContent).select(cssSelector);
  if (matches == null || matches.size() == 0) {
    return null;
  }
  return matches.html().trim();
}
/**
 * Fetches the configured page with GET or POST and stores, for every attribute whose query
 * matches, the mapped text of the first match in the given meta data.
 *
 * @param metaData receives one entry per attribute that matched
 * @throws MetaDataException if the access method is unsupported, the page cannot be fetched or
 *     a value cannot be mapped
 */
@Override
public void populateMetaData(MetaData metaData) throws MetaDataException {
  try {
    final Document page;
    if (method.equals("GET")) {
      page = Jsoup.connect(url).get();
    } else if (method.equals("POST")) {
      page = Jsoup.connect(url).data(requestData).post();
    } else {
      throw new MetaDataException("Unsupported HTML access method: " + method);
    }

    for (MetaDataAttribute attribute : attributes) {
      Elements matches = page.select(attribute.getQuery());
      if (matches.size() == 0) {
        // Attributes without a match are simply left out.
        continue;
      }
      String rawValue = matches.get(0).text();
      metaData.put(attribute.getName(), attribute.getValueMapper().parse(rawValue));
    }
  } catch (IOException e) {
    throw new MetaDataException(e);
  } catch (ValueMapperException e) {
    throw new MetaDataException(e);
  }
}
/**
 * Reads price, model number and title from a Best Buy listing page, then follows the title link
 * to the specification page to look up the UPC.
 *
 * @param doc ignored; the listing page is always fetched from {@code url}
 * @param url address of the listing page
 * @return map with {@code price}, {@code modelNumber}, {@code title}, {@code GoodSKU} ("true"
 *     when the specification table has rows) and, when a row mentions it, {@code upc}
 */
public HashMap<String, String> initialBestBuyScan(Document doc, String url) {
  Document listing = jsoupConnect(url);
  Elements titleLink = listing.select(".list-item-info .sku-title h4 a");

  HashMap<String, String> matchingItems = new HashMap<String, String>();
  matchingItems.put("price", listing.select(".medium-item-price").text());
  matchingItems.put(
      "modelNumber", listing.select(".list-item-info .sku-model ul .model-number").text());
  matchingItems.put("title", titleLink.text());

  String newURL = "http://bestbuy.com" + bestBuySpecsFormatter(titleLink.attr("href"));
  System.out.println(newURL);

  Document specs = jsoupConnect(newURL);
  Elements specRows = specs.select("#full-specifications table tbody tr");
  for (Element row : specRows) {
    String rowText = row.text();
    if (rowText.contains("UPC")) {
      // Only the first row mentioning the UPC is used.
      matchingItems.put("upc", rowText.replace("UPC ", ""));
      break;
    }
  }
  matchingItems.put("GoodSKU", specRows.size() < 1 ? "false" : "true");

  specs.empty();
  return matchingItems;
}
@Override public void run() { // TODO Auto-generated method stub Document doc = null; Elements eles = null; if (!Utils.isNET(NewsContentActivity.this)) { Utils.showToast(NewsContentActivity.this, "网络不可用哦,亲!", Toast.LENGTH_SHORT); } else { try { doc = Jsoup.connect(url).timeout(8000).get(); if (null == doc) { Utils.showToast(NewsContentActivity.this, "网络不给力哦,亲,请返回再进入吧!", Toast.LENGTH_SHORT); return; } eles = doc.select("#Cnt-Main-Article-QQ P"); StringBuilder sb = new StringBuilder(); for (int i = 0; i < eles.size(); i++) { sb.append(eles.get(i).outerHtml()); } Message msg = new Message(); Bundle bundle = new Bundle(); bundle.putString("content", sb.toString()); Log.i("content", sb.toString()); msg.setData(bundle); msg.what = NewsContentActivity.NEWCONTENTRECEIVED; myHandler.sendMessage(msg); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }
/**
 * Replaces {@code cid:} references in {@code src} and {@code href} attributes with the URL of
 * the matching attachment.
 *
 * <p>The {@code cid:} scheme is matched case-insensitively and independently of the default
 * locale. References without a matching attachment are left unchanged.
 *
 * @param html markup to rewrite
 * @param attachments attachments keyed by content id
 * @return the inner HTML of the body, or the whole document HTML when there is no body element
 */
private static String replaceCidWithAttachments(
    String html, Map<String, Attachment> attachments) {
  final String scheme = "cid:";
  Document doc = Jsoup.parse(html);
  String[] attrNames = {"src", "href"};

  for (String attrName : attrNames) {
    Elements tags = doc.select("*[" + attrName + "]");
    for (Element tag : tags) {
      String uriString = tag.attr(attrName).trim();
      // regionMatches(ignoreCase) compares char by char; toLowerCase() would depend on the
      // default locale (e.g. "CID:" does not lower-case to "cid:" in a Turkish locale).
      if (!uriString.regionMatches(true, 0, scheme, 0, scheme.length())) {
        continue;
      }

      String cid = uriString.substring(scheme.length());
      Attachment attachment = attachments.get(cid);
      if (attachment == null) {
        continue;
      }

      Long id = attachment.id;
      tag.attr(attrName, controllers.routes.AttachmentApp.getFile(id).url());
    }
  }

  Elements bodies = doc.getElementsByTag("body");
  if (bodies.size() > 0) {
    return bodies.get(0).html();
  } else {
    return doc.html();
  }
}
/**
 * Defines the flow that polls Evernote, splits each non-empty batch into notes and forwards the
 * words of every note to the word request channel.
 *
 * <p>The words are the text of the note's single {@code en-note} element; when the content is
 * blank, has no or several {@code en-note} elements, or yields blank text, the note title is
 * forwarded instead. Null results are filtered out.
 */
@Bean
public IntegrationFlow evernoteIntegration() {
  return IntegrationFlows.from(
          this.evernoteMessageSource(),
          endpoint ->
              endpoint.poller(Pollers.fixedRate(pollIntervalInSeconds, TimeUnit.SECONDS)))
      .channel(this.inputChannel())
      .filter(Collection.class, batch -> !batch.isEmpty())
      .split()
      .transform(
          Note.class,
          note -> {
            String content = note.getContent();
            if (!StringUtils.isNotBlank(content)) {
              return note.getTitle();
            }
            Elements noteElements = Jsoup.parse(content).select("en-note");
            if (noteElements.size() != 1) {
              return note.getTitle();
            }
            String wordsFromNote = noteElements.get(0).text();
            return StringUtils.isNotBlank(wordsFromNote) ? wordsFromNote : note.getTitle();
          },
          endpoint -> endpoint.requiresReply(false))
      .filter(payload -> payload != null)
      .channel(wordRequestsChannel)
      .get();
}