/**
 * Converts one search-result table row into a {@link Name} and appends it to {@code results}.
 *
 * <p>The thumbnail comes from the {@code primary_photo} cell, the name and link from the first
 * anchor of the {@code result_text} cell. An optional {@code <small>} element supplies the job
 * and/or a {@link Reference}: text with a comma is split into "job, reference title"; text
 * without a comma is treated as a reference title when it ends in a parenthesised number and as
 * a job otherwise.
 *
 * @param query the search query (not used by this implementation)
 * @param options search options (not used by this implementation)
 * @param tr the table row to parse
 * @param results list that receives the parsed entry
 */
@Override
protected void parseRow(
    final String query, final int options, final Element tr, final List<Name> results) {
  final Element photoCell = tr.getElementsByAttributeValue("class", "primary_photo").first();
  final String thumbnailUrl = photoCell.getElementsByTag("img").first().attr("src");

  final Element textCell = tr.getElementsByAttributeValue("class", "result_text").first();
  final Element nameLink = textCell.getElementsByTag("a").first();
  final String url = Imdb.BASE_URL + nameLink.attr("href");
  final String name = nameLink.ownText();

  String job = "";
  Reference ref = null;

  final Elements smalls = textCell.getElementsByTag("small");
  if (!smalls.isEmpty()) {
    final Element small = smalls.first();
    final String refUrl = Imdb.BASE_URL + small.getElementsByTag("a").first().attr("href");

    // Strip one pair of surrounding parentheses, if present.
    String desc = small.text();
    if (desc.startsWith("(") && desc.endsWith(")")) {
      desc = desc.substring(1, desc.length() - 1);
    }

    final int comma = desc.indexOf(',');
    if (comma != -1) {
      job = desc.substring(0, comma).trim();
      ref = new Reference(refUrl, desc.substring(comma + 1).trim());
    } else if (desc.matches(".+\\(\\d+\\)")) {
      // No comma: the whole description is the reference title.
      ref = new Reference(refUrl, desc.trim());
    } else {
      job = desc;
    }
  }

  results.add(new Name(url, thumbnailUrl, name, job, ref));
}
/** A tag name containing a namespace prefix ("abc:def") can be looked up by its full name. */
@Test
public void getNamespacedElementsByTag() {
  Document doc = Jsoup.parse("<div><abc:def id=1>Hello</abc:def></div>");

  Elements matches = doc.getElementsByTag("abc:def");

  assertEquals(1, matches.size());
  assertEquals("1", matches.get(0).id());
  assertEquals("abc:def", matches.get(0).tagName());
}
/**
 * Builds the tomes of a series, together with their chapters, from a chapter-list page.
 *
 * <p>Every {@code span.left} inside the first {@code div.detail_list} is one chapter. The tome
 * number is read from the text after {@code "Vol "} in the nested {@code span.mr6}; chapters
 * without a volume marker are grouped under tome 0. Tomes are created on first use and reused
 * for later chapters carrying the same number.
 *
 * @param htmlDocument the parsed chapter-list page
 * @param parent the series the tomes belong to; its validity date is set to the current time
 * @return the tomes found, in order of first appearance (empty when the page has no chapter list)
 * @throws NumberFormatException if a volume or chapter number is not numeric
 */
@Override
protected List<Tome> parseTomes(Document htmlDocument, Serie parent) {
  Date today = new Date();
  List<Tome> tomes = new LinkedList<>();
  Elements divChapters = htmlDocument.select("div.detail_list");
  if (!divChapters.isEmpty()) {
    for (Element span : divChapters.first().select("span.left")) {
      // The volume marker is optional; a missing marker leaves the chapter in tome 0.
      Element tomeNumberElement = span.select("span.mr6").first();
      String tomeNumberString =
          tomeNumberElement == null
              ? null
              : StringUtils.substringAfter(tomeNumberElement.text(), "Vol ");
      int tomeNumber = 0;
      if (tomeNumberString != null && !tomeNumberString.trim().isEmpty()) {
        // The parsed value was previously discarded, which put every chapter in tome 0.
        tomeNumber = Integer.parseInt(tomeNumberString.trim());
      }

      Tome foundTome = null;
      for (Tome tome : tomes) {
        if (tomeNumber == tome.getNumber()) {
          foundTome = tome;
          break;
        }
      }
      if (foundTome == null) {
        foundTome = new Tome();
        foundTome.setNumber(tomeNumber);
        foundTome.setName("Tome " + tomeNumber);
        foundTome.setMustBeSaved(true);
        foundTome.setValidityDate(today);
        foundTome.setSerie(parent);
        tomes.add(foundTome);
      }

      Element link = span.select("a").first();
      Chapter chapter = new Chapter();
      chapter.setMustBeSaved(true);
      chapter.setUrl(link.attr("href"));
      // The chapter number is the last space-separated token of the link text.
      String tempNumber = StringUtils.substringAfterLast(link.text(), " ");
      chapter.setNumber(Float.parseFloat(tempNumber));
      chapter.setName(span.text());
      chapter.setTome(foundTome);
      foundTome.addChapter(chapter);
    }
  }
  parent.setValidityDate(today);
  return tomes;
}
/** * 解析回帖列表 * * @param content * @return */ public static List<Post> parsePostList(String content) { long s = System.currentTimeMillis(); List<Post> posts = new ArrayList<Post>(); Document document = Jsoup.parse(content); document.setBaseUri(Constants.BASE_URL); Elements elements = document.getElementsByClass("plc"); for (Element plc : elements) { try { Post post = new Post(); // 解析头像 Element avatar = plc.getElementsByClass("avatar").first(); post.setAvatarUrl(avatar.child(0).absUrl("src")); String authi = plc.getElementsByClass("authi").first().html(); Element message = plc.getElementsByClass("message").first(); post.setContent(message.html().trim()); // // 解析头像 // // Element avatar = plc.getElementsByClass("avatar").first(); // Element avatar = plc.child(0); // post.setAvatarUrl(avatar.child(0).absUrl("src")); // // // Element message = plc.getElementsByClass("message").first(); // Element display = plc.child(1); // String authi = display.child(0).html(); // Element message = display.child(1); // post.setContent(message.html().trim()); try { // 主贴没有replyUrl String replyUrl = plc.getElementsByClass("replybtn").first().child(0).absUrl("href"); post.setReplyUrl(replyUrl); } catch (Exception e) { } Elements img_list = plc.getElementsByClass("img_list"); if (img_list != null && !img_list.isEmpty()) { String imgList = img_list.first().html(); post.setImgList(imgList); } else { // 单张图片附件时 Elements img_one = plc.getElementsByClass("img_one"); if (img_one != null && !img_one.isEmpty()) { String imgOne = img_one.first().html(); post.setImgList(imgOne); } } post.setAuthi(authi); posts.add(post); } catch (Exception e) { } LogMessage.i("parsePostList", "解析时间:" + (System.currentTimeMillis() - s)); } return posts; }
public void download(Connection aInConnection, Collection<Image> images) throws IOException { aInConnection.url(url); Document lDocument = aInConnection.get(); Element lMain = lDocument.getElementById("main"); Elements lContents = lMain.getElementsByClass("content"); if (lContents.size() == 1) { StringBuilder sb = new StringBuilder(); Element lContent = lContents.first(); collectImages(lContent, images); Elements lLightboxElements = lContent.getElementsByClass("lightbox"); for (Element lLightboxElement : lLightboxElements) { Collection<Node> lImageNodes = extractImageNodes(lLightboxElement); Element lParent = lLightboxElement.parent(); int i = lLightboxElement.siblingIndex(); lParent.insertChildren(i, lImageNodes); lLightboxElement.remove(); } Elements lChildElements = lContent.children(); for (Element lChildElement : lChildElements) { if (lChildElement.hasClass("clear")) { // no more post content break; } if (title == null && lChildElement.tagName().equals("h1")) { // the first h1 header is the title title = lChildElement.html(); } else { if (excerpt == null && lChildElement.tagName().equals("p")) { excerpt = lChildElement.text(); } String lStr = lChildElement.toString(); sb.append(lStr); } } content = sb.toString(); Elements lDateElements = lContent.getElementsByClass("date"); String lHunDate = lDateElements.first().html(); date = new PostDate(lHunDate); } else { System.out.println("More than one content in main section of post page " + toString()); } }
@Override public Object parseHtml2Obj(String html) { Document doc = Jsoup.parse(html); Element title = doc.getElementById("activity-name"); Element createtime = doc.getElementById("post-date"); // Element from = doc.getElementById("post-user"); Element content = doc.getElementById("essay-body"); Elements pic = doc.select("#media img"); Elements _intro = doc.select(".text p"); String intro = null; if (_intro.isEmpty()) { intro = "阅读全部"; } else { intro = _intro.first().text(); } // List<ArticleObj> objs = new ArrayList<ArticleObj>(); ArticleObj obj = new ArticleObj(); obj.setFrom(account_desc); obj.setContent(content.html()); obj.setCreatetime(createtime.text()); obj.setTitle(title.text()); obj.setIntro(intro.substring(0, intro.length() > 50 ? 50 : intro.length()) + "..."); if (!pic.isEmpty()) { String src = pic.get(0).attr("src"); obj.setPic(getSrc(src)); } System.err.println(obj.getPic()); dbRobot.AddArticleData(obj); cur_count++; return null; }
/** Mudah is not standardized, result will be messy if crawl them */ @Override public List<Item> parse(String query, int size) throws IOException { // request for a page Document doc = Jsoup.connect("http://www.mudah.my/li?q=" + query) .userAgent(Constant.HTTP_USER_AGENT) .timeout(Constant.HTTP_TIMEOUT) .get(); Elements listS = doc.select("div.listing_thumbs").first().select("div.list_ads"); ArrayList<Item> result = new ArrayList<Item>(size); for (int i = 0; i < listS.size(); i++) { Element list = listS.get(i); String img = ""; list.select("div.image_thumb"); Elements imgS = list.select("div.image_thumb > a + img"); if (imgS.size() < 0) { // some may not have images img = imgS.first().attr("href"); } Element listE = list.select("li.listing_ads_title").first(); String title = listE.child(0).text(); String url = listE.child(0).attr("href"); String price = listE.text(); price = price.substring(price.lastIndexOf("RM") + 2).trim().replaceAll(" ", ""); int dPrice = Integer.parseInt(price); result.add(new Item("Mudah", title, dPrice, img, url)); } return result; }
/**
 * Checks that the text of the element(s) selected by {@code selector} matches {@code regexp}.
 *
 * <p>When exactly one element is selected, its text must match in full and the matcher is kept
 * in {@code lastMatch}. Otherwise at least one selected element must match in full;
 * {@code lastMatch} is not touched in that case. A violation is recorded when the check fails.
 *
 * @param selector CSS selector evaluated against the current document
 * @param regexp regular expression, compiled with {@link Pattern#MULTILINE}
 * @return the wrapped result: {@code wrap(true)} on a match, {@code wrap(false)} otherwise
 */
public boolean contentMatch(String selector, String regexp) {
  Elements selected = document().select(selector);
  Pattern pattern = Pattern.compile(regexp, Pattern.MULTILINE);

  if (selected.size() == 1) {
    lastMatch = pattern.matcher(selected.first().text());
    if (lastMatch.matches()) {
      return wrap(true);
    }
    addViolation(String.format("DOM要素 '%s' のテキストが正規表現 '%s' にマッチしません", selector, regexp));
    return wrap(false);
  }

  for (Element candidate : selected) {
    if (pattern.matcher(candidate.text()).matches()) {
      return wrap(true);
    }
  }
  addViolation(
      String.format("DOM要素 '%s' の中に、テキストが正規表現 '%s' にマッチするものが見付かりません", selector, regexp));
  return wrap(false);
}
@Override public void parse(String result, Task task) throws Exception { List<ECBean> beans = new ArrayList<ECBean>(); JSONObject parseObject = JSON.parseObject(result); Object object = parseObject.get("value"); Document doc = Jsoup.parse((String) object); Elements eles = doc.select("div.mod_search_pro"); String categroy = ""; for (Element element : eles) { ECBean bean = new ECBean("yhd"); Elements select = element.select("p.proName > a"); String url = select.first().attr("href"); String id = select.first().attr("pmid"); // 过滤没有pid数据 if (id.equals("0")) { continue; } String name = select.first().attr("title"); // 抓取分类 if (StringUtils.isBlank(categroy)) { Document document = GlobalComponents.fetcher.document(url); Elements select2 = document.select("div.crumb > a"); StringBuilder sb = new StringBuilder(); for (int i = 1; i < select2.size() - 1; i++) { sb.append(select2.get(i).text()); } sb.deleteCharAt(sb.length() - 1); categroy = sb.toString(); categroy = StringUtils.replaceChars(categroy, "", "/"); } bean.setId(id); bean.setUrl(url); bean.setTitle(name); bean.setCategory(categroy); bean.setKeyword(task.getExtra()); beans.add(bean); } log.info("fetch list:" + beans.size()); if (!beans.isEmpty()) { for (ECBean bean : beans) { bean.saveOnNotExist(); } } beans.clear(); }
/**
 * An attribute selector whose attribute name contains a dash ({@code http-equiv}) matches only
 * the {@code meta} element carrying that attribute value.
 */
@Test
public void testGetElementsWithAttributeDash() {
  String html =
      "<meta http-equiv=content-type value=utf8 id=1> <meta name=foo content=bar id=2> <div http-equiv=content-type value=utf8 id=3>";
  Document doc = Jsoup.parse(html);

  Elements meta = doc.select("meta[http-equiv=content-type], meta[charset]");

  assertEquals(1, meta.size());
  assertEquals("1", meta.get(0).id());
}
/** * 提取每一场演出的票价 * * @param url 演出url */ private void extractEach(String url) { Show show = new Show(); try { show.setAgent_id(agentID); Document ticket = getDoc(url); show.setType(typeCor.get(ticket.select("a.font12hui_bottom:eq(2)").text().trim())); show.setName(ticket.select("td.PERFORM_BOLD_NAME").text()); // 演出标题 // 演出简介 show.setIntroduction( PubFun.cleanElement(ticket.select("body>table").get(6).select("table").get(3)).html()); show.setSiteName(ticket.select(".font12hui:contains(演出场馆)").text().replace("演出场馆:", "")); show.setImage_path(ticket.select("img[width=240]").first().attr("abs:src")); Map<String, List<TicketPrice>> timeAndPrice = new HashMap<String, List<TicketPrice>>(); show.setTimeAndPrice(timeAndPrice); for (Element each : ticket.select("body>table").get(6).select("table tr[id^=perform_price_line]")) { Elements tmp = each.select("td"); String time = tmp.get(1).text(); if (time.length() == 18) { // 正常时间 time = time.substring(0, 16); } List<TicketPrice> ticketPrice = new ArrayList<TicketPrice>(); timeAndPrice.put(time, ticketPrice); int priceIndex = 2; if (tmp.size() > 3) { // 含有套票 String[] prices = tmp.get(priceIndex).select("span.font14lanse").text().split("\\s+"); for (int i = 0; i < prices.length; i++) { Elements a = tmp.get(priceIndex).select("span.font14lanse a:matches(\\b" + prices[i] + "\\b)"); TicketPrice price = new TicketPrice(); price.setMainURL(url); price.setPrice(prices[i]); price.setExist(!a.isEmpty()); if (price.isExist()) { price.setRemark(a.first().attr("title")); } ticketPrice.add(price); } priceIndex = 3; } String[] prices = tmp.get(priceIndex).select("span.font14lanse").text().split("\\s+"); for (int i = 0; i < prices.length; i++) { // 正常的非套票 Elements a = tmp.get(priceIndex).select("span.font14lanse a:matches(\\b" + prices[i] + "\\b)"); TicketPrice price = new TicketPrice(); price.setMainURL(url); price.setPrice(prices[i]); price.setExist(!a.isEmpty()); ticketPrice.add(price); } } getDao().saveShow(show); } catch (Exception e) { 
log.error(url, e); } }
/**
 * Handles the "first" selector part: narrows the current {@code elements} to its first element.
 *
 * @param query the selector part; must equal {@code FIRST_TAG}
 * @return a new list holding the first current element, or an empty list when there is none
 * @throws IllegalArgumentException if {@code query} is not {@code FIRST_TAG}
 */
private Elements parseFirst(String query) {
  if (!FIRST_TAG.equals(query)) {
    throw new IllegalArgumentException("Argument selector part: " + query + " is illegal");
  }
  Elements eles = new Elements();
  // first() returns null on an empty list; adding that null would hand callers a
  // one-element result containing null instead of an empty result.
  if (!elements.isEmpty()) {
    eles.add(elements.first());
  }
  return eles;
}
/**
 * Extracts profile data from a profile page and stores it in {@code userprofile}.
 *
 * <p>Picture URL, location and self introduction fall back to the literal string
 * {@code "null"} when their element is missing; the three counters fall back to {@code -1}
 * when the {@code default-footer} element is missing. The profile is only written after all
 * values have been read.
 *
 * @param content HTML of the profile page
 * @param userprofile profile object that receives the extracted values
 */
public void doAnylyze(String content, UserProfile userprofile) {
  Document doc = Jsoup.parse(content, "/");

  Elements pictureEls =
      doc.getElementsByAttributeValueContaining(
          "class", "profile-picture media-thumbnail js-nav js-tooltip");
  String pictureUrl = pictureEls.isEmpty() ? "null" : pictureEls.get(0).child(0).attr("src");

  Elements locationEls = doc.getElementsByAttributeValue("class", "location profile-field");
  String location = locationEls.isEmpty() ? "null" : locationEls.first().ownText();

  Elements bioEls = doc.getElementsByAttributeValue("class", "bio profile-field");
  String bio = bioEls.isEmpty() ? "null" : bioEls.first().ownText();

  int tweet = -1;
  int following = -1;
  int follower = -1;
  Elements footerEls = doc.getElementsByAttributeValue("class", "default-footer");
  if (!footerEls.isEmpty()) {
    Element footer = footerEls.first();
    tweet = this.getCount(footer, "tweet_stats");
    following = this.getCount(footer, "following_stats");
    follower = this.getCount(footer, "follower_stats");
  }

  userprofile.setTweet(tweet);
  userprofile.setFollower(follower);
  userprofile.setFollowing(following);
  userprofile.setPicture_url(pictureUrl);
  userprofile.setLocation(location);
  userprofile.setSelfintroduction(bio);
}
private void saveSmsLog( SimpleObject context, final int page, final int t, final Date d, final String dstr, final int isHistory) { String text = ContextUtil.getContent(context); Document doc = ContextUtil.getDocumentOfContent(context); System.out.println(doc.toString()); if (text.indexOf("没有查找到相关数据") >= 0) { return; } String tableSort = InfoUtil.getInstance().getInfo("dx/sh", "tableSort"); String tbody = InfoUtil.getInstance().getInfo("dx/sh", "tbody"); String tr = InfoUtil.getInstance().getInfo("dx/sh", "tr"); String td = InfoUtil.getInstance().getInfo("dx/sh", "td"); Elements elements = doc.select(tableSort); if (elements != null && elements.size() > 0) { Elements elements2 = elements.first().select(tbody).first().select(tr); for (int j = 0; j < elements2.size(); j++) { try { Elements tds = elements2.get(j).select(td); if (tds.size() == 5) { String RecevierPhone = tds.get(2).text().trim(); // 对方号码 String SentTime = tds.get(1).text().trim(); // 发送时间 String BusinessType = tds.get(3).text().trim(); // 费用类型 String AllPay = tds.get(4).text().trim(); // 费用 Date sentTime = null; try { sentTime = DateUtils.StringToDate(SentTime, "yyyy-MM-dd HH:mm:ss"); } catch (Exception e) { e.printStackTrace(); } TelcomMessage obj = new TelcomMessage(); obj.setPhone(phoneNo); UUID uuid = UUID.randomUUID(); obj.setId(uuid.toString()); obj.setBusinessType(BusinessType); // 业务类型:点对点 obj.setRecevierPhone(RecevierPhone); // 对方号码 obj.setSentTime(sentTime); // 发送时间 obj.setCreateTs(new Date()); obj.setAllPay(Double.parseDouble(AllPay)); // 总费用 messageList.add(obj); } } catch (Exception e) { logger.error("saveSmsLog", e); } } if (text.contains("下一页")) { requestSmsLogService(page + 1, 1, d, dstr, isHistory); } } }
/**
 * {@code hasText()} is true for an element with text of its own or in a descendant, and false
 * for an empty element.
 */
@Test
public void testHasText() {
  Document doc = Jsoup.parse("<div><p>Hello</p><p></p></div>");
  Elements paragraphs = doc.select("p");

  assertTrue(doc.select("div").first().hasText());
  assertTrue(paragraphs.first().hasText());
  assertFalse(paragraphs.last().hasText());
}
/** * Constructs a component hierarchy from the design specified as an html tree. * * <p>If a component root is given, the component instances created during reading the design are * assigned to its member fields based on their id, local id, and caption * * @param doc the html tree * @param componentRoot optional component root instance. The type must match the type of the root * element in the design. * @param classWithFields a class (componentRoot class or a super class) with some member fields. * The member fields whose type is assignable from {@link Component} are bound to fields in * the design based on id/local id/caption */ private static DesignContext designToComponentTree( Document doc, Component componentRoot, Class<?> classWithFields) { DesignContext designContext = new DesignContext(doc); designContext.readPackageMappings(doc); // No special handling for a document without a body element - should be // taken care of by jsoup. Element root = doc.body(); Elements children = root.children(); if (children.size() > 1) { throw new DesignException( "The first level of a component hierarchy should contain at most one root component, but found " + children.size() + "."); } Element element = children.size() == 0 ? 
null : children.first(); if (componentRoot != null) { if (element == null) { throw new DesignException( "The root element cannot be null when the specified root Component is" + " not null."); } // user has specified root instance that may have member fields that // should be bound final FieldBinder binder; try { binder = new FieldBinder(componentRoot, classWithFields); } catch (IntrospectionException e) { throw new DesignException("Could not bind fields of the root component", e); } // create listener for component creations that binds the created // components to the componentRoot instance fields ComponentCreationListener creationListener = new ComponentCreationListener() { @Override public void componentCreated(ComponentCreatedEvent event) { binder.bindField(event.getComponent(), event.getLocalId()); } }; designContext.addComponentCreationListener(creationListener); // create subtree designContext.readDesign(element, componentRoot); // make sure that all the member fields are bound Collection<String> unboundFields = binder.getUnboundFields(); if (!unboundFields.isEmpty()) { throw new DesignException("Found unbound fields from component root " + unboundFields); } // no need to listen anymore designContext.removeComponentCreationListener(creationListener); } else { // createChild creates the entire component hierarchy componentRoot = element == null ? null : designContext.readDesign(element); } designContext.setRootComponent(componentRoot); return designContext; }
/**
 * Reads a numeric counter from below {@code ele}.
 *
 * <p>The counter is the own text of the first child of the first element whose
 * {@code data-element-term} attribute equals {@code dataElementTerm}; commas are removed
 * before parsing.
 *
 * @param ele element to search in
 * @param dataElementTerm value of the {@code data-element-term} attribute to look for
 * @return the parsed counter, or {@code -1} when no matching element exists
 * @throws NumberFormatException if the counter text is not an integer
 */
private int getCount(Element ele, String dataElementTerm) throws NumberFormatException {
  Elements matches = ele.getElementsByAttributeValue("data-element-term", dataElementTerm);
  if (matches == null || matches.size() == 0) {
    return -1;
  }
  String digits = matches.first().child(0).ownText().replaceAll(",", "");
  return Integer.parseInt(digits);
}
@Override public Date extract(Element rootElement) { // this belongs in a separate class as a proper Date Extractor. But... // for now this knows how to pull dates out of mnartists articles Elements body = rootElement.select("div[class=articleBody]"); try { if (body.size() > 0) { Elements ems = body.first().select("em"); if (ems.size() > 0) { String em = ems.first().text(); String date_str = string.isNullOrEmpty(em) ? string.empty : em.trim(); logger.error(date_str); Date date = new SimpleDateFormat("MMM dd, yyyy").parse(date_str); return date; } } } catch (java.text.ParseException e) { // ignore it } return null; }
/**
 * Loads the listing page once and caches its first {@code loupan_list_none} element in
 * {@code elem} as the fixture for the tests. Later invocations are no-ops while {@code elem}
 * is set.
 *
 * @throws IOException if the page cannot be fetched or read
 */
@Before
public void init() throws IOException {
  if (elem != null) {
    return;
  }
  URL url =
      new URL(
          "http://newhouse.hfhouse.com/HouseList/index/keyWord/%20%E4%B8%AD%E6%B5%B7%E6%BB%A8%E6%B9%96%E5%85%AC%E9%A6%86/");
  URLConnection conn = url.openConnection();
  String result;
  // Close the response stream once it has been read; it was previously left open.
  try (java.io.InputStream in = conn.getInputStream()) {
    result = IOUtils.toString(in, "utf-8");
  }
  Document doc = Jsoup.parse(result);
  Elements loupanList = doc.getElementsByAttributeValue("class", "loupan_list_none");
  elem = loupanList.first();
}
/**
 * Scrapes news entries from the nested table of a listing page.
 *
 * <p>The selected rows are processed in groups of three, counted from zero: the row at
 * position 1 of a group carries title, URL and (optionally) domain; the row at position 2
 * carries the subtext with id, points and comment count and completes the entry; the row at
 * position 0 is ignored. Scraping stops at the first title row that has fewer than two
 * {@code .title} cells.
 *
 * @param doc the parsed page
 * @return the entries scraped so far, in page order
 */
public List<News> scrape(Document doc) {
  Elements rows =
      doc.select("body > center > " + "table > tbody > tr > td > " + "table > tbody > tr");
  List<News> newsList = new ArrayList<>();
  News.Builder builder = null;

  int num = 0;
  for (Element row : rows) {
    int position = num % 3;
    if (position == 1) {
      Elements titles = row.select(".title");
      if (titles.size() < 2) {
        // Not a regular title row: end of the news list.
        break;
      }
      builder = new News.Builder();
      Element titleCell = titles.get(1);
      Element anchor = titleCell.select("a").first();
      builder.title = anchor.text();
      builder.url = getUrl(anchor);
      Elements comhead = titleCell.select(".comhead");
      if (comhead.size() > 0) {
        builder.domain = extract(comhead.first().text(), DOMAIN);
      }
    } else if (position == 2) {
      assert builder != null;
      Element subtext = row.select(".subtext").first();
      Elements links = subtext.select("a");
      if (links.size() > 1) {
        Element comments = links.get(1);
        builder.id = getId(comments);
        builder.points = getPoints(subtext);
        builder.commentsNum = getCommentsNum(comments);
      }
      newsList.add(builder.build());
    }
    num++;
  }
  return newsList;
}
/**
 * After following the redirect from the question page, the resulting page carries the
 * originally requested URL in its {@code redirectUrl} input.
 */
@Test
public void should_save_url_when_redirected_to_login() {
  UserFlow navigation = createQuestionPage(navigate());

  VRaptorTestResult navigationResult = navigation.followRedirect().execute();
  navigationResult.wasStatus(200).isValid();

  String redirectUrl =
      getElementsByAttributeAndValue(navigationResult, "name", "redirectUrl")
          .first()
          .attr("value");
  String expectedUrl = rootPath(navigationResult).concat("/perguntar");

  assertThat(redirectUrl, equalTo(expectedUrl));
}
@Test public void insertChildrenAsCopy() { Document doc = Jsoup.parse("<div id=1>Text <p>One</p> Text <p>Two</p></div><div id=2></div>"); Element div1 = doc.select("div").get(0); Element div2 = doc.select("div").get(1); Elements ps = doc.select("p").clone(); ps.first().text("One cloned"); div2.insertChildren(-1, ps); assertEquals(4, div1.childNodeSize()); // not moved -- cloned assertEquals(2, div2.childNodeSize()); assertEquals( "<div id=\"1\">Text <p>One</p> Text <p>Two</p></div><div id=\"2\"><p>One cloned</p><p>Two</p></div>", TextUtil.stripNewlines(doc.body().html())); }
/**
 * Checks that at least one element selected by {@code selector} has exactly {@code text} as
 * its trimmed text.
 *
 * <p>On failure a violation is recorded; when exactly one element was selected the message
 * also reports the text that element actually has.
 *
 * @param selector CSS selector evaluated against the current document
 * @param text the expected text
 * @return the wrapped result: {@code wrap(true)} when a matching element exists,
 *     {@code wrap(false)} otherwise
 */
public boolean content(String selector, String text) {
  Elements selected = document().select(selector);

  boolean found = selected.stream().anyMatch(e -> e.hasText() && e.text().trim().equals(text));
  if (found) {
    return wrap(true);
  }

  String message;
  if (selected.size() == 1) {
    message =
        String.format(
            "DOM要素 '%s' に文字列 '%s' がセットされているはずですが '%s' となっています",
            selector, text, selected.first().text());
  } else {
    message = String.format("DOM要素 '%s' で文字列 '%s' をもつものが見付かりません", selector, text);
  }
  addViolation(message);
  return wrap(false);
}
public String getAlbumTitle(URL url) throws MalformedURLException { try { // Attempt to use album title as GID if (albumDoc == null) { logger.info(" Retrieving " + url.toExternalForm()); sendUpdate(STATUS.LOADING_RESOURCE, url.toString()); albumDoc = Jsoup.connect(url.toExternalForm()).userAgent(USER_AGENT).timeout(TIMEOUT).get(); } Elements elems = albumDoc.select(".albumName"); return HOST + "_" + elems.first().text(); } catch (Exception e) { // Fall back to default album naming convention logger.warn("Failed to get album title from " + url, e); } return super.getAlbumTitle(url); }
/**
 * Determines the entry type from the text of the first {@code wordType} node below
 * {@code element}.
 *
 * @param element the element to inspect
 * @return the mapped entry type, or {@link EntryType#UNKNOWN} when there is no
 *     {@code wordType} node or its text has no mapping in {@code ENTRY_TYPE_MAP}
 */
private EntryType detectEntryType(@NotNull Element element) {
  Elements wordTypeNodes = element.getElementsByClass("wordType");
  if (wordTypeNodes.size() < 1) {
    LOGGER.debug("No wordType node found - defaulting to {}", EntryType.UNKNOWN);
    return EntryType.UNKNOWN;
  }

  String wordType = wordTypeNodes.first().text();
  EntryType entryType = ENTRY_TYPE_MAP.getOrDefault(wordType, EntryType.UNKNOWN);
  if (entryType == EntryType.UNKNOWN) {
    // Log the text that could not be resolved; the message used to print the resolved
    // value, which is always UNKNOWN at this point.
    LOGGER.debug("Unable to resolve entry type \"{}\"", wordType);
  }
  return entryType;
}
@Override public void initialize(URL location, ResourceBundle resources) { urlField.setOnAction( event -> { String text = urlField.getText(); urlField.setText("tetetetetetetete"); webView.getEngine().load(text); }); webView .getEngine() .getLoadWorker() .stateProperty() .addListener( (ov, oldState, newState) -> { if (newState == State.SUCCEEDED) { String url = webView.getEngine().getLocation(); urlField.setText(url); if (Pattern.compile("http://item.rakuten.co.jp/.*").matcher(url).find()) { try { Elements tmp; Document document = Jsoup.connect(url).get(); tmp = document.select("input"); tmp = tmp.select("#etime"); if (tmp.size() != 0) { if (!(Long.parseLong(tmp.first().val()) < new Date().getTime())) { entryButton.setDisable(false); } } else { entryButton.setDisable(false); } } catch (Exception e) { // TODO 自動生成された catch ブロック e.printStackTrace(); } } } ; }); entryButton.setOnAction( event -> { urlField.setText("webView disable"); sendEntryTaskController(); }); }
/**
 * Handles the "next element" selector part.
 *
 * <p>When exactly one element is current, the result holds its next element sibling (or is
 * empty if there is none). For any other number of current elements the current
 * {@code elements} list itself is returned unchanged.
 *
 * @param query the selector part; must equal {@code NEXT_ELEMENT_TAG}
 * @return the resulting elements
 * @throws IllegalArgumentException if {@code query} is not {@code NEXT_ELEMENT_TAG}
 */
private Elements parseNextElement(String query) {
  if (!NEXT_ELEMENT_TAG.equals(query)) {
    throw new IllegalArgumentException("Argument selector part: " + query + " is illegal");
  }
  if (elements.size() != 1) {
    return elements;
  }
  Elements next = new Elements();
  Element sibling = elements.first().nextElementSibling();
  if (sibling != null) {
    next.add(sibling);
  }
  return next;
}
/**
 * Parses the detail page of a record into a {@link DetailledItem}.
 *
 * <p>Title, cover and the label/value rows of the first {@code .record table} are read from
 * the page; afterwards either the volumes or the copies are parsed, depending on whether a
 * {@code #Volumes} element exists. Rows with fewer than two cells are ignored, and a page
 * without a record table simply yields no detail rows.
 *
 * @param id the record id, stored in the result
 * @param doc the parsed detail page
 * @param data library configuration; {@code "baseurl"} is read from it
 * @return the parsed item
 * @throws OpacErrorException if the page shows an error message
 * @throws JSONException if {@code data} has no {@code "baseurl"} entry when one is needed
 */
static DetailledItem parseDetail(String id, Document doc, JSONObject data)
    throws OpacErrorException, JSONException {
  Elements errors = doc.select("p.error, p.errorMsg, .alert-error");
  if (errors.size() > 0) {
    throw new OpacErrorException(errors.text());
  }

  DetailledItem res = new DetailledItem();
  res.setId(id);

  Elements title = doc.select(".record h1, .record [itemprop=name], .record [property=name]");
  if (title.size() > 0) {
    res.setTitle(title.first().text());
  }

  // Only the first image whose URL contains "over" is considered; it is skipped when it is
  // an "Unavailable" placeholder.
  for (Element img : doc.select(".record img, #cover img")) {
    String src = img.absUrl("src");
    if (src.contains("over")) {
      if (!src.contains("Unavailable")) {
        res.setCover(src);
      }
      break;
    }
  }

  Element table = doc.select(".record table").first();
  if (table != null) {
    for (Element tr : table.select("tr")) {
      if (tr.children().size() < 2) {
        // Not a label/value row; child(1) would throw here.
        continue;
      }
      Element valueCell = tr.child(1);
      String text = valueCell.text();
      Elements links = valueCell.select("a");
      if (links.size() > 0) {
        String href = links.attr("href");
        // Append the link target unless it is site-relative or the text already
        // contains the base URL.
        if (!href.startsWith("/") && !text.contains(data.getString("baseurl"))) {
          text += " " + href;
        }
      }
      res.addDetail(new Detail(tr.child(0).text(), text));
    }
  }

  try {
    if (doc.select("#Volumes").size() > 0) {
      parseVolumes(res, doc, data);
    } else {
      parseCopies(res, doc, data);
    }
  } catch (JSONException e) {
    // Best-effort: the item is still returned without volumes/copies.
    e.printStackTrace();
  }
  return res;
}
/**
 * Extracts dictionary entries from a result page.
 *
 * <p>The page is expected to contain exactly one table. Every {@code td} with a class
 * attribute is one word: its own text is the word, the first {@code <i>} holds the meta
 * information, and each {@code <b>} is one meaning. One {@link DictionaryEntry} is produced
 * per meaning.
 *
 * @param doc the parsed page
 * @return the entries found; empty when the page does not contain exactly one table
 */
public List<DictionaryEntry> extractEntries(Document doc) {
  Elements tables = doc.body().select("table");
  // The guard used to reject only "more than one table", so a page without any table fell
  // through to tables.first() and failed with a NullPointerException.
  if (tables.size() != 1) {
    return Collections.emptyList();
  }

  List<DictionaryEntry> dictionary = new ArrayList<DictionaryEntry>();
  for (Element entry : tables.first().select("td[class]")) {
    String word = entry.ownText();
    // The meta element is optional; use an empty string rather than failing the page.
    Elements metaNodes = entry.select("i");
    String meta = metaNodes.isEmpty() ? "" : metaNodes.first().text();
    for (Element meaning : entry.select("b")) {
      dictionary.add(new DictionaryEntry(word, meaning.text(), meta));
    }
  }
  return dictionary;
}
private static int parseAlumni() throws Exception { int count = 0; // Processes all pages, following "next" links. String url = START_URL; while (url != null) { // Opens the page and extracts the HTML DOM structure into Jsoup. System.out.printf("Parsing %s...%n", url); Document doc = Jsoup.connect(url).get(); url = null; // Looks for the first table in the document, where the names of the students are supposed to // be. Element table = doc.select(SELECTOR_TABLE).first(); // Extracts the rows from the table. Goes through all of them. Elements rows = table.select(SELECTOR_ROW); for (Element row : rows) { // Extracts the columns from the row. Elements columns = row.select(SELECTOR_COLUMN); // Read the columns with useful information. if (!columns.isEmpty()) { count++; Element nameCell = columns.get(COLUMN_NAME); Element defenseDateCell = columns.get(COLUMN_DEFENSE_DATE); Element levelCell = columns.get(COLUMN_LEVEL); // Also extracts the link to the detail page of the alumni. String link = baseUrl + nameCell.select(SELECTOR_DETAIL_LINK).attr(ATTRIBUTE_LINK); // Creates and stores the alumni in the set. alumni.add(new Alumnus(nameCell.text(), defenseDateCell.text(), levelCell.text(), link)); } } // Checks if there's a next page. Elements nextLinks = doc.select(SELECTOR_NEXT_LINK); if (!nextLinks.isEmpty()) url = nextLinks.first().attr(ATTRIBUTE_LINK); if (url != null && url.startsWith("/")) url = baseUrl + url; } return count; }