/**
 * Fetches the article behind {@code aLink} and builds the HTML section used in the generated
 * report.
 *
 * <p>The page is downloaded with Jsoup; its first {@code <title>} element and the element
 * returned by {@code parser.parseArticleFromDoc(doc)} are extracted, and the article markup is
 * cleaned with {@code Whitelist.basic()} before being passed to
 * {@code HtmlContentWriterUtil.generateArticleHtml}.
 *
 * @param aLink URL of the article; {@code null} or empty yields an empty result
 * @param parser locates the main article element inside the fetched document
 * @return the generated HTML, or an empty string when the link is blank, the download fails, or
 *     the title or article body cannot be found
 */
public static String getArticleContent(String aLink, ArticleBodyParser parser) {
  if (Objects.isNull(aLink) || aLink.isEmpty()) {
    return "";
  }

  Document doc;
  try {
    doc =
        Jsoup.connect(aLink)
            .header("Accept-Encoding", "gzip, deflate")
            .userAgent(userAgent)
            .timeout(6000)
            .followRedirects(true)
            .maxBodySize(0)
            .get();
  } catch (IOException e) {
    // Hand the exception to the logger so the cause is not lost.
    LOGGER.log(
        Level.WARNING, "Error connecting, while fetching the article with link " + aLink, e);
    return "";
  }
  if (doc == null) {
    return "";
  }

  Element title = doc.getElementsByTag("title").first();
  Element mainArticle = parser.parseArticleFromDoc(doc);
  if (Objects.isNull(title) || Objects.isNull(mainArticle)) {
    LOGGER.warning("We could not fetch the title and main body for link: " + aLink);
    return "";
  }

  return HtmlContentWriterUtil.generateArticleHtml(
      title.html(), aLink, Jsoup.clean(mainArticle.html(), Whitelist.basic()));
}
/**
 * Pulls a page and attempts to discover a feed for it via link[rel='alternate'].
 *
 * <p>The first {@code <link rel="alternate" type="application/rss+xml">} wins. Its {@code href}
 * is resolved against {@code url}, so a relative link such as {@code /feed.xml} is passed to
 * the feed source manager as an absolute URL.
 *
 * @param url The URL of the page to try and discover the feed for.
 * @return The feedsource if matched or created, may be null.
 * @throws ClientProtocolException If the page could not be pulled.
 * @throws IOException If the page could not be pulled.
 * @throws DataOperationException If a query could not be executed.
 */
public FeedSource discover(final String url)
    throws ClientProtocolException, IOException, DataOperationException {
  log.fine("Discovering feed for " + url);
  try (final CloseableHttpClient client = HttpClientBuilder.create().build()) {
    final HttpGet get = new HttpGet(url);
    try (final CloseableHttpResponse response = client.execute(get)) {
      final String html = EntityUtils.toString(response.getEntity());
      // Parse with the page URL as base URI so absUrl() can resolve relative hrefs.
      final Document doc = Jsoup.parse(html, url);
      for (final Element link : doc.select("link")) {
        if (!"alternate".equals(link.attr("rel"))) {
          log.fine("Found link " + link.html());
          continue;
        }
        if (!"application/rss+xml".equals(link.attr("type"))) {
          log.fine("Found alternate link " + link.html());
          continue;
        }
        // absUrl() returns "" when the href cannot be resolved; keep the raw value then.
        final String resolved = link.absUrl("href");
        final String rss = resolved.isEmpty() ? link.attr("href") : resolved;
        log.fine("Found rss link " + rss);
        return this.feedSourceManager.findOrCreateByFeedUrl(rss);
      }
    }
  }
  return null;
}
/** Replacing an element's inner HTML must be reflected by a subsequent {@code html()} call. */
@Test
public void testSetHtml() {
  Element target = Jsoup.parse("<div id=1><p>Hello</p></div>").getElementById("1");

  target.html("<p>there</p><p>now</p>");

  String rendered = TextUtil.stripNewlines(target.html());
  assertEquals("<p>there</p><p>now</p>", rendered);
}
/** A cloned element must own an independent class-name set. */
@Test
public void testClonesClassnames() {
  Document doc = Jsoup.parse("<div class='one two'></div>");
  Element original = doc.select("div").first();
  Set<String> originalNames = original.classNames();
  assertEquals(2, originalNames.size());
  assertTrue(originalNames.contains("one"));
  assertTrue(originalNames.contains("two"));

  Element duplicate = original.clone();
  Set<String> duplicateNames = duplicate.classNames();
  assertEquals(2, duplicateNames.size());
  assertTrue(duplicateNames.contains("one"));
  assertTrue(duplicateNames.contains("two"));

  // Mutating the clone's set must leave the original's set untouched.
  duplicateNames.add("three");
  duplicateNames.remove("one");

  assertTrue(originalNames.contains("one"));
  assertFalse(originalNames.contains("three"));
  assertFalse(duplicateNames.contains("one"));
  assertTrue(duplicateNames.contains("three"));

  assertEquals("", original.html());
  assertEquals("", duplicate.html());
}
private RawBankAccount obtainBankAccountFromHtmlTableRow(String type, Element row) { if ("detail".equalsIgnoreCase(row.attr("class"))) { // detail row return null; } if ("bg0".equalsIgnoreCase(row.attr("class"))) { Log.v(TAG, "working row(" + type + "): " + row.html()); if ("Current Accounts".equalsIgnoreCase(type)) { return new RawBankAccount() .setServerId(row.child(2).text()) .setName(row.child(0).child(0).text()) .setIBAN(row.child(2).text()) .setCurrency(row.child(1).text()) .setBalance(Convert.strToFloat(row.child(3).text())) .setAvailableBalance(Convert.strToFloat(row.child(4).text())); } else if ("Cards".equalsIgnoreCase(type)) { // skip cards for now return null; } else { // unknown type return null; } } else { return null; } }
/**
 * Requests {@code https://www.google.com} + {@code url} for each entry of the
 * semicolon-separated {@code goodGoogleSources} list that occurs in {@code url}, and returns
 * the inner HTML of the first {@code <p>} element found.
 *
 * @param url path (and query) appended to {@code https://www.google.com}
 * @return inner HTML of the first paragraph of the first fetched page that has one, or an empty
 *     string when no source matches or no fetched page contains a paragraph
 * @throws IOException if a page cannot be fetched
 * @throws InterruptedException declared for callers; not thrown by the visible code
 */
public String getDomainName(String url) throws IOException, InterruptedException {
  for (String source : goodGoogleSources.split(";")) {
    if (!url.contains(source)) {
      continue;
    }
    String request = String.format("https://www.google.com%s", url);
    System.out.println("request=" + request);
    Document doc =
        Jsoup.connect(request)
            .userAgent("Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)")
            .timeout(5000)
            .get();
    // Only the first paragraph is ever used; a page without one falls through to the next
    // matching source.
    Element firstParagraph = doc.select("p").first();
    if (firstParagraph != null) {
      return firstParagraph.html();
    }
  }
  return "";
}
/**
 * Rebuilds {@code list} as a sequence of {@code <li><a>} entries, one per schema.
 *
 * <p>The first 11 schemas are rendered as plain links. Before the 12th schema a "MORE..." link
 * with class {@code moreSchemas} is inserted, and the 12th and all following schemas are
 * rendered with the classes {@code hiddenSchema hidden}. Previously the 12th schema was
 * replaced by the "MORE..." link and never rendered.
 *
 * @param list element whose content is replaced
 * @param skeemas schemas to render, in order
 */
public static void schemaList(Element list, List<Skeema> skeemas) {
  final int visibleCount = 11;
  list.html("");
  int i = 0;
  for (Skeema s : skeemas) {
    String href = "/schema/" + s.getId().toString();
    if (i < visibleCount) {
      list.appendElement("li").appendElement("a").attr("href", href).text(s.skeemaID);
    } else {
      if (i == visibleCount) {
        // The toggle is inserted once, right before the first hidden entry.
        list.appendElement("li")
            .appendElement("a")
            .attr("href", "#")
            .addClass("moreSchemas")
            .text("MORE...");
      }
      list.appendElement("li")
          .appendElement("a")
          .addClass("hiddenSchema hidden")
          .attr("href", href)
          .text(s.skeemaID);
    }
    i++;
  }
}
public static void processEpub(String bookPath, String dest) throws FileNotFoundException, IOException { EpubReader reader = new EpubReader(); Book b = reader.readEpub(new FileInputStream(new File(bookPath))); String content = ""; int pagecount = 1; int tempCounter; Count cnt = new Count(0, 0); for (Resource res : b.getContents()) { content = new String(res.getData()); Document doc = Jsoup.parse(content, "UTF-8"); // http-equiv=\"content-type\" content=\"text/html; charset=utf-8\""); Element elem = new Element(Tag.valueOf("meta"), ""); elem.attr("http-equiv", "content-type"); elem.attr("content", "text/html; charset=utf-8"); doc.head().after(elem); System.out.println(doc.head().data()); Element ele = doc.body(); alterElement(ele); Count cTemp = modify(ele, cnt); cnt.setCount(cTemp.getCount()); cnt.setPgCount(cTemp.getPgCount()); doc.body().html(ele.html()); res.setData(doc.html().getBytes()); if (res.getMediaType() == null) res.setMediaType(new MediaType("html", "html")); } EpubWriter wr = new EpubWriter(); wr.write(b, new FileOutputStream(new File(dest))); }
/**
 * Text appended with {@code appendText} is stored as a text node, so {@code html()} must
 * render its {@code &} and {@code >} characters as entities.
 */
@Test
public void testAddNewText() {
  Document doc = Jsoup.parse("<div id=1><p>Hello</p></div>");
  Element div = doc.getElementById("1");
  div.appendText(" there & now >");
  // The expected markup is escaped; the raw characters would never match html() output.
  assertEquals("<p>Hello</p> there &amp; now &gt;", TextUtil.stripNewlines(div.html()));
}
@Override public Object parseHtml2Obj(String html) { Document doc = Jsoup.parse(html); Element title = doc.getElementById("activity-name"); Element createtime = doc.getElementById("post-date"); // Element from = doc.getElementById("post-user"); Element content = doc.getElementById("essay-body"); Elements pic = doc.select("#media img"); Elements _intro = doc.select(".text p"); String intro = null; if (_intro.isEmpty()) { intro = "阅读全部"; } else { intro = _intro.first().text(); } // List<ArticleObj> objs = new ArrayList<ArticleObj>(); ArticleObj obj = new ArticleObj(); obj.setFrom(account_desc); obj.setContent(content.html()); obj.setCreatetime(createtime.text()); obj.setTitle(title.text()); obj.setIntro(intro.substring(0, intro.length() > 50 ? 50 : intro.length()) + "..."); if (!pic.isEmpty()) { String src = pic.get(0).attr("src"); obj.setPic(getSrc(src)); } System.err.println(obj.getPic()); dbRobot.AddArticleData(obj); cur_count++; return null; }
/**
 * Reads the route variations from every {@code div.route_variations} block of {@code page}.
 *
 * <p>The first block is applied to {@code Direction.IN}; every following block is applied to
 * {@code Direction.OUT}. Each {@code <p>} is reduced to plain text ({@code <br>} becomes a line
 * break, other tags are dropped, runs of blanks collapse to one space). Every resulting line
 * longer than three characters is split at its first space into a variation initial and a
 * description; lines without a description are skipped, as are lines whose initial equals
 * {@code code}.
 */
private void makeVariations() {
  Elements variations = page.select("div.route_variations");
  direction = Direction.IN;
  for (Element directionVariation : variations) {
    direction.clearVariations();
    for (Element variation : directionVariation.select("p")) {
      String variationDescription =
          variation
              .html()
              // Non-breaking spaces may appear as the entity or as the raw character.
              .replace("&nbsp;", " ")
              .replace('\u00A0', ' ')
              .replaceAll("<(br|BR)>", "\n")
              .replaceAll("</?[a-zA-Z]+.*?>", "")
              .replaceAll("([ \t\\xA0])+", " ");
      if (variationDescription.length() <= 3) {
        continue;
      }
      for (String line : variationDescription.split("\n+")) {
        if (line.length() <= 3) {
          continue;
        }
        String[] parts = line.trim().split(" ", 2);
        if (parts.length < 2) {
          // A line with no space has no description; previously parts[1] threw here.
          continue;
        }
        String variationInitial = parts[0];
        String description = parts[1];
        if (!code.equals(variationInitial)) {
          direction.addVariation(variationInitial, description);
        }
      }
    }
    direction = Direction.OUT;
  }
}
/**
 * Rewrites lazily loaded images in the given markup and injects the cover image.
 *
 * <p>For every {@code <img>} that has a {@code data-src} attribute, {@code src} is set to that
 * URL up to and including its last {@code /} followed by {@code 640}; the {@code data-s},
 * {@code data-src} and {@code data-w} attributes are removed and {@code max-width} is set to
 * {@code 640}. Images without {@code data-src} are left untouched; previously their
 * {@code src} was overwritten with the bare string {@code "640"}. For every script containing
 * {@code var cover = "..."}, an {@code <img>} with that URL is inserted before the first
 * {@code <div>}.
 *
 * @param pcont markup to process, may be {@code null}
 * @return the serialized document, or an empty string when {@code pcont} is {@code null}
 */
public String reviseImgForWX(String pcont) {
  if (pcont == null) {
    return "";
  }
  Document doc = Jsoup.parse(pcont);

  for (Element img : doc.select("img")) {
    String source = img.attr("data-src");
    if (source.isEmpty()) {
      continue;
    }
    int pos = source.lastIndexOf("/") + 1;
    source = source.substring(0, pos);
    img.removeAttr("data-s");
    img.removeAttr("data-src");
    img.removeAttr("data-w");
    img.attr("src", source + "640");
    img.attr("max-width", "640");
  }

  Elements elesrp = doc.select("script");
  Elements divs = doc.select("div");
  if (elesrp.size() > 0 && divs.size() > 0) {
    // Compiled once instead of once per script element.
    Pattern coverPattern = Pattern.compile("(?<=(var\\scover\\s=\\s\"))\\S+(?=\")");
    for (Element ele : elesrp) {
      Matcher m = coverPattern.matcher(ele.html());
      if (m.find()) {
        String nimg = "<img src=\"" + m.group() + "\"/>";
        divs.get(0).before(nimg);
      }
    }
  }
  return doc.html();
}
/**
 * Copies the topic id and title from the first {@code .item_title > a} link found under
 * {@code ele} into {@code topicBuilder}.
 *
 * @param topicBuilder builder that receives the id (derived from the link's {@code href}) and
 *     the title (the link's inner HTML)
 * @param ele element containing the title link
 */
private static void parseTitle(Topic.Builder topicBuilder, Element ele) {
  final Element titleLink = ele.select(".item_title > a").get(0);
  Preconditions.checkState(titleLink.tagName().equals("a"));

  final String href = titleLink.attr("href");
  topicBuilder.setId(Topic.getIdFromUrl(href));
  topicBuilder.setTitle(titleLink.html());
}
@Test public void test01() throws Exception { String url = "http://search.jd.com/Search?keyword=OReilly&enc=utf-8&book=y&wq=OReilly"; // Connection connect = Jsoup.connect(url); // Connection.Response execute = connect.execute(); Document parse = Jsoup.parse(new URL(url), 5000); Elements elements = parse.select(".p-name em"); for (Element element : elements) { System.out.println(element.html()); } }
/** With pretty-printing disabled, the original whitespace must be emitted unchanged. */
@Test
public void testNotPretty() {
  Document doc = Jsoup.parse("<div> \n<p>Hello\n there\n</p></div>");
  doc.outputSettings().prettyPrint(false);

  String wholeDocument = doc.html();
  assertEquals(
      "<html><head></head><body><div> \n<p>Hello\n there\n</p></div></body></html>",
      wholeDocument);

  Element firstDiv = doc.select("div").first();
  String innerMarkup = firstDiv.html();
  assertEquals(" \n<p>Hello\n there\n</p>", innerMarkup);
}
/** * 解析回帖列表 * * @param content * @return */ public static List<Post> parsePostList(String content) { long s = System.currentTimeMillis(); List<Post> posts = new ArrayList<Post>(); Document document = Jsoup.parse(content); document.setBaseUri(Constants.BASE_URL); Elements elements = document.getElementsByClass("plc"); for (Element plc : elements) { try { Post post = new Post(); // 解析头像 Element avatar = plc.getElementsByClass("avatar").first(); post.setAvatarUrl(avatar.child(0).absUrl("src")); String authi = plc.getElementsByClass("authi").first().html(); Element message = plc.getElementsByClass("message").first(); post.setContent(message.html().trim()); // // 解析头像 // // Element avatar = plc.getElementsByClass("avatar").first(); // Element avatar = plc.child(0); // post.setAvatarUrl(avatar.child(0).absUrl("src")); // // // Element message = plc.getElementsByClass("message").first(); // Element display = plc.child(1); // String authi = display.child(0).html(); // Element message = display.child(1); // post.setContent(message.html().trim()); try { // 主贴没有replyUrl String replyUrl = plc.getElementsByClass("replybtn").first().child(0).absUrl("href"); post.setReplyUrl(replyUrl); } catch (Exception e) { } Elements img_list = plc.getElementsByClass("img_list"); if (img_list != null && !img_list.isEmpty()) { String imgList = img_list.first().html(); post.setImgList(imgList); } else { // 单张图片附件时 Elements img_one = plc.getElementsByClass("img_one"); if (img_one != null && !img_one.isEmpty()) { String imgOne = img_one.first().html(); post.setImgList(imgOne); } } post.setAuthi(authi); posts.add(post); } catch (Exception e) { } LogMessage.i("parsePostList", "解析时间:" + (System.currentTimeMillis() - s)); } return posts; }
/** * get the movie from title from an element * * @param elem * @return */ private static String getTitle(Element elem) { String html = elem.html(); String title = ""; int start = html.indexOf("</a>"); while (html.charAt(--start) != '>') { title = html.charAt(start) + title; } // System.out.println("title:"+title); return title; }
/**
 * Recursively walks {@code parentElement} and records elements in {@code map} and
 * {@code topMenumap}, keyed by {@code "<level>,<hashCode>"}.
 *
 * <p>The element's inner HTML is first replaced by the result of {@code deleteComent(...)}.
 * Then:
 *
 * <ul>
 *   <li>An element with children that is a {@code ul} or {@code table} is handled as one unit:
 *       if its HTML contains "首页" or its id contains "nav", each of its {@code <a>}
 *       descendants is put into {@code topMenumap} unless the key is already present;
 *       otherwise the element itself is put into {@code map}.
 *   <li>Any other element with children is descended into, one child at a time.
 *   <li>An element without children is ignored if it is a {@code script}; otherwise, if its
 *       HTML is not empty, it is put into {@code map} at {@code level + 1}.
 * </ul>
 *
 * @param parentElement element to inspect
 * @param level depth used as the first part of the map keys
 */
private void getChildElement(Element parentElement, Integer level) {
  parentElement.html(deleteComent(parentElement.html()));
  if (parentElement.children().size() > 0) {
    level += 1;
    // The ul/table branch does not depend on i, so it is repeated once per child.
    for (int i = 0; i < parentElement.children().size(); i++) {
      if (("ul".equals(parentElement.tagName().toLowerCase()))
          || ("table".equals(parentElement.tagName().toLowerCase()))) { // treated as one unit
        // Removes Chinese and English spaces.
        // NOTE(review): both patterns look identical here; presumably one is a full-width or
        // non-breaking space — confirm the literal survived encoding.
        String html = parentElement.html().replaceAll(" ", "").replaceAll(" ", "");
        if (html.contains("首页") || parentElement.id().contains("nav")) {
          // Navigation block: collect every link once.
          Elements links = parentElement.select("a");
          for (Element ele : links) {
            if (topMenumap.get(level + "," + ele.hashCode()) == null) {
              topMenumap.put(level + "," + ele.hashCode(), ele);
            }
          }
        } else {
          map.put(level + "," + parentElement.hashCode(), parentElement);
        }
      } else {
        getChildElement(parentElement.child(i), level);
      }
    }
  } else {
    if ("script".equals(parentElement.tagName().toLowerCase())) {
      return;
    }
    if (StringUtils.isNotEmpty(parentElement.html())) {
      level += 1;
      map.put(level + "," + parentElement.hashCode(), parentElement);
    }
  }
}
@Test public void testPrependNewHtml() { Document doc = Jsoup.parse("<div id=1><p>Hello</p></div>"); Element div = doc.getElementById("1"); div.prepend("<p>there</p><p>now</p>"); assertEquals("<p>there</p><p>now</p><p>Hello</p>", TextUtil.stripNewlines(div.html())); // check sibling index (reindexChildren): Elements ps = doc.select("p"); for (int i = 0; i < ps.size(); i++) { assertEquals(i, ps.get(i).siblingIndex); } }
/**
 * Returns the trimmed content of the {@code small} element of the given column and records the
 * column in {@code currentColumn}.
 *
 * @param column column to read
 * @param stripHtml {@code true} to return the element's text, {@code false} for its inner HTML
 * @return the trimmed content, or an empty string when the element cannot be read
 */
protected String getColumnContent(int column, boolean stripHtml) {
  try {
    currentColumn = column;
    Element element = getElement(column, "small");
    return (stripHtml ? element.text() : element.html()).trim();
  } catch (Exception e) {
    // getMessage() may be null (typical for a NullPointerException); presumably Log is
    // android.util.Log, which throws on a null message, so never pass null through.
    Log.d(getClass().getName(), String.valueOf(e.getMessage()));
    e.printStackTrace();
  }
  return "";
}
private void parseEmoji(int count, JsonWriter jsonWriter) throws IOException { // 从html文件读取网页字符串 InputStream inputStream = getResources().getAssets().open("html/emoji-code-" + count + ".html"); // 根据输入流获取document对象 Document document = Jsoup.parse(inputStream, "utf-8", "http://apps.timwhitlock.info/emoji/tables/unicode"); // 开始解析一层层剥离对象 Element h3 = document.getElementsByClass("category").get(0); // 开始第一个分类 jsonWriter.name(KEY_EMOJI_CATEGORY).value(h3.html()); jsonWriter.name(KEY_EMOJI_ARRAY); jsonWriter.beginArray(); Elements trs = document.getElementsByClass("table-bordered").get(0).child(0).getElementsByTag("tr"); for (int i = 0; i < trs.size(); i++) { // 开始记录emoji jsonWriter.beginObject(); // 获取emoji对应的值 String codeUnicode = trs.get(i).child(7).child(0).html(); String codeUtf8 = trs.get(i).child(8).html(); String name = trs.get(i).child(9).html(); jsonWriter .name(KEY_EMOJI_ICON_URL_ANDROID) .value( "http://apps.timwhitlock.info/static/images/emoji/emoji-android/" + codeUnicode + ".png"); jsonWriter .name(KEY_EMOJI_ICON_URL_APPLE) .value( "http://apps.timwhitlock.info/static/images/emoji/emoji-apple/" + codeUnicode + ".png"); jsonWriter.name(KEY_EMOJI_DESCRIPTION).value(name); jsonWriter.name(KEY_EMOJI_UNICODE).value(codeUnicode); jsonWriter.name(KEY_EMOJI_UTF8).value(codeUtf8); // 结束一个 emoji JSONObject jsonWriter.endObject(); } // 结束一个 emoji category JSONArray jsonWriter.endArray(); // 结束一个 emoji category JSONObject jsonWriter.endObject(); inputStream.close(); }
/**
 * Checks whether any element matching {@code selector} displays {@code text}.
 *
 * <p>The expected text is compared with its lines trimmed and joined. An element's inner HTML
 * is compared after trimming, splitting at {@code <br>} tags, trimming each part and removing
 * the line breaks inside it. A violation is recorded when no element matches.
 *
 * @param selector CSS selector of the candidate elements
 * @param text text that is expected to be displayed
 * @return the result of {@code wrap(true)} on the first match, otherwise {@code wrap(false)}
 */
public boolean contentLongText(String selector, String text) {
  StringBuilder expected = new StringBuilder();
  for (String line : text.split("\n")) {
    expected.append(line.trim());
  }
  String shrinkText = expected.toString();

  for (Element candidate : document().select(selector)) {
    StringBuilder displayed = new StringBuilder();
    for (String segment : candidate.html().trim().split("<(br|BR|Br|bR) */?>")) {
      for (String piece : segment.trim().split("\n")) {
        displayed.append(piece);
      }
    }
    if (displayed.toString().equals(shrinkText)) {
      return wrap(true);
    }
  }

  addViolation(String.format("入力されたはずのテキストがDOM要素 '%s' に表示されていません", selector));
  return wrap(false);
}
/**
 * Selects the elements to test: those matching {@code FLASH_CONTENT_CSS_LIKE_QUERY} go into
 * {@code decidableElements}; those matching {@code SCRIPT_ELEMENT} go into
 * {@code notDecidableElements}, from which every script whose HTML does not contain
 * {@code SWF_EXT} is then removed.
 *
 * @param sspHandler handler passed to the element selectors
 */
@Override
protected void select(SSPHandler sspHandler) {
  ElementSelector flashSelector = new SimpleElementSelector(FLASH_CONTENT_CSS_LIKE_QUERY);
  flashSelector.selectElements(sspHandler, decidableElements);

  ElementSelector scriptSelector = new SimpleElementSelector(SCRIPT_ELEMENT);
  scriptSelector.selectElements(sspHandler, notDecidableElements);

  // Keep only the scripts that mention the SWF extension.
  for (Iterator<Element> it = notDecidableElements.get().iterator(); it.hasNext(); ) {
    boolean mentionsSwf = StringUtils.contains(it.next().html(), SWF_EXT);
    if (!mentionsSwf) {
      it.remove();
    }
  }
}
public void download(Connection aInConnection, Collection<Image> images) throws IOException { aInConnection.url(url); Document lDocument = aInConnection.get(); Element lMain = lDocument.getElementById("main"); Elements lContents = lMain.getElementsByClass("content"); if (lContents.size() == 1) { StringBuilder sb = new StringBuilder(); Element lContent = lContents.first(); collectImages(lContent, images); Elements lLightboxElements = lContent.getElementsByClass("lightbox"); for (Element lLightboxElement : lLightboxElements) { Collection<Node> lImageNodes = extractImageNodes(lLightboxElement); Element lParent = lLightboxElement.parent(); int i = lLightboxElement.siblingIndex(); lParent.insertChildren(i, lImageNodes); lLightboxElement.remove(); } Elements lChildElements = lContent.children(); for (Element lChildElement : lChildElements) { if (lChildElement.hasClass("clear")) { // no more post content break; } if (title == null && lChildElement.tagName().equals("h1")) { // the first h1 header is the title title = lChildElement.html(); } else { if (excerpt == null && lChildElement.tagName().equals("p")) { excerpt = lChildElement.text(); } String lStr = lChildElement.toString(); sb.append(lStr); } } content = sb.toString(); Elements lDateElements = lContent.getElementsByClass("date"); String lHunDate = lDateElements.first().html(); date = new PostDate(lHunDate); } else { System.out.println("More than one content in main section of post page " + toString()); } }
public static String getHighlightedText_math( String text, String color, String apiElement) // for math after merging with recodoc { String highlightBeginning = "<SPAN style=\"BACKGROUND-COLOR: " + color + "\">"; String highlightEnding = "</SPAN>"; Document doc = Jsoup.parse(text); Elements apiElements = doc.select("clt[api=" + apiElement + "]"); for (Element apielement : apiElements) { Document tmp = new Document(""); String[] apis = apielement.text().split("\\."); if (apis.length == 2) apielement.html(highlightBeginning + apis[0] + highlightEnding + "." + apis[1]); else apielement.wrap(highlightBeginning); } // highlight code snippet Elements codesnippets = doc.getElementsByTag("pre"); for (Element codesnippet : codesnippets) { String html = codesnippet.html(); Pattern apielementPattern = Pattern.compile("(?<=\\W)" + apiElement + "(?=\\W)"); Matcher matcher = apielementPattern.matcher(html); codesnippet.html(matcher.replaceAll(highlightBeginning + apiElement + highlightEnding)); } // remove clt tags for display Elements clts = doc.getElementsByTag("clt"); for (Element clt : clts) { clt.unwrap(); // clt.replaceWith(new TextNode(clt.text(), "")); } return doc.html(); }
/**
 * Returns the text of the single element matching {@code jsoupSelector} below {@code elsPar}.
 *
 * @param elsPar element to search in
 * @param jsoupSelector selector that must match exactly one element
 * @param exitIfNotFound when {@code true}, a failed match is reported as {@code RC.ERROR} with
 *     an error message that is also logged; otherwise it is reported as {@code RC.NOT_FOUND}
 * @return a result carrying the element text on success; on a failed match a result with an
 *     empty string and {@code continua} set to {@code false}
 */
public static Result text(Element elsPar, String jsoupSelector, boolean exitIfNotFound) {
  Result outcome = new Result();
  Elements matches = elsPar.select(jsoupSelector);

  boolean exactlyOne = matches != null && matches.size() == 1;
  if (exactlyOne) {
    return outcome.setRetStr(matches.get(0).text());
  }

  outcome.setRc(RC.NOT_FOUND);
  if (exitIfNotFound) {
    outcome.setRc(RC.ERROR);
    String message =
        "jsoup selector on elements does not match: " + jsoupSelector + "\n" + elsPar.html();
    outcome.setErrorMessage(message);
    log.error(outcome.getErrorMessage());
  }
  return outcome.setContinua(false).setRetStr("");
}
@Override protected Boolean doInBackground(String... params) { try { Document doc = Jsoup.connect(params[0]).get(); Element body = doc.body(); Elements titleEs = body.select("td.title"); Elements subTitleEs = body.select("td.subtext"); int index = 1; if (!titleEs.isEmpty()) { if (mType == TYPE_REFRESH && mNews.size() > 0) { mNews.clear(); } Iterator<Element> iterator = titleEs.iterator(); Iterator<Element> subIt = subTitleEs.iterator(); NewEntity entity = null; User user = null; while (iterator.hasNext()) { Element e = iterator.next(); if (index % 2 == 0) { Element subE = subIt.next(); Elements aTag = e.select("a"); Elements spanTag = e.select("span.comhead"); Elements subEa = subE.select("a"); user = new User(); user.setId(subEa.get(0).text()); entity = new NewEntity( aTag.get(0).attr("href"), aTag.get(0).text(), spanTag.isEmpty() ? null : spanTag.get(0).text(), subE.html()); entity.setDiscussUrl(subEa.get(1).attr("href")); // Log.i(LOG_TAG, entity.toString()); mNews.add(entity); } index++; } } Elements more = doc.getElementsByAttributeValueStarting("href", "/x?fnid="); if (!more.isEmpty()) { mMoreURLPath = more.get(1).attr("href"); } return true; } catch (IOException e) { Log.e(LOG_TAG, "", e); return false; } }
/**
 * Runs every registered matcher against {@code element} and collects the extracted values.
 *
 * <p>Depending on the matcher group, the value is the decoded element text, the decoded outer
 * HTML of the next sibling node, the decoded outer HTML of the element's first own text node,
 * the element's inner HTML, its plain-text rendering, or the value of a named attribute. A
 * matcher whose source node is missing contributes nothing.
 *
 * @param element element to test
 * @return attribute name to extracted value for every matcher that applied; possibly empty
 */
public Map<String, String> attempt(Element element) {
  Map<String, String> attributes = new HashMap<String, String>();

  for (Entry<String, Matcher> entry : matchers.entrySet()) {
    if (entry.getValue().test(element)) {
      attributes.put(entry.getKey(), decode(element.text()));
    }
  }

  for (Entry<String, Matcher> entry : textMatchers.entrySet()) {
    if (entry.getValue().test(element)) {
      Node textNode = element.nextSibling();
      if (null != textNode) {
        attributes.put(entry.getKey(), decode(textNode.outerHtml()));
      }
    }
  }

  for (Entry<String, Matcher> entry : subtextMatchers.entrySet()) {
    if (entry.getValue().test(element)) {
      // An element without own text nodes yields an empty list; get(0) would throw.
      if (!element.textNodes().isEmpty()) {
        TextNode textNode = element.textNodes().get(0);
        attributes.put(entry.getKey(), decode(textNode.outerHtml()));
      }
    }
  }

  for (Entry<String, Matcher> entry : htmlMatchers.entrySet()) {
    if (entry.getValue().test(element)) {
      attributes.put(entry.getKey(), element.html());
    }
  }

  for (Entry<String, Matcher> entry : ptextMatchers.entrySet()) {
    if (entry.getValue().test(element)) {
      attributes.put(entry.getKey(), plainTextFormatter.getPlainText(element));
    }
  }

  for (Entry<String, Object[]> entry : attrMatchers.entrySet()) {
    // Each value is a pair: [0] the matcher, [1] the attribute name to read.
    Object[] objects = entry.getValue();
    Matcher matcher = (Matcher) objects[0];
    String attr = (String) objects[1];
    if (matcher.test(element)) {
      attributes.put(entry.getKey(), element.attr(attr));
    }
  }

  return attributes;
}
/**
 * Builds the frequency links from the first {@code table.philologic_table} in the given
 * markup.
 *
 * <p>Each {@code tr.freq_row} contributes one link: the count is parsed from the row's first
 * {@code td.freq_value} (0 when absent), and target and text come from its first
 * {@code a[href]} (empty strings when absent). Nothing is added when the table is missing.
 *
 * @param html markup of the frequency result page
 */
public FrequencyImpl(String html) {
  Element table = Jsoup.parse(html).select("table.philologic_table").first();
  if (null == table) {
    return;
  }

  for (Element row : table.select("tr.freq_row")) {
    Element countCell = row.select("td.freq_value").first();
    Element anchor = row.select("a[href]").first();
    boolean hasAnchor = null != anchor;

    _links.add(
        new FrequencyLinkImpl()
            .setCount(null != countCell ? Integer.parseInt(countCell.text()) : 0)
            .setLink(hasAnchor ? anchor.attr("href") : "")
            .setText(hasAnchor ? anchor.html() : ""));
  }
}
/** Setting HTML on {@code title} and {@code head} elements must round-trip through html(). */
@Test
public void testSetHtmlTitle() {
  Document doc = Jsoup.parse("<html><head id=2><title id=1></title></head></html>");

  Element titleElement = doc.getElementById("1");
  titleElement.html("good");
  assertEquals("good", titleElement.html());

  final String markupInTitle = "<i>bad</i>";
  titleElement.html(markupInTitle);
  assertEquals(markupInTitle, titleElement.html());

  Element headElement = doc.getElementById("2");
  final String titleInHead = "<title><i>bad</i></title>";
  headElement.html(titleInHead);
  assertEquals(titleInHead, headElement.html());
}