public static void getComic(String arg) { Document doc; try { doc = Jsoup.connect(arg).get(); // String title = doc.title(); // System.out.print("Title: " + title); // Select the img tag in the comic id Elements links = doc.select("#comic img"); System.out.print("\nComic Name : " + links.attr("alt")); System.out.print("\nImage Source : " + links.attr("src") + "\n\n"); URL url = new URL(links.attr("src")); RenderedImage comic = ImageIO.read(url); String baseName = links.attr("alt").replaceAll("\\s", "_"); ImageIO.write(comic, "png", new File("/home/paranoidsp/Pictures/xkcd/" + baseName + ".png")); /* * Unfortunately, the transcript isn't formatted, so I get one * large line of text instead of readable dialogue. * TODO: Fix this. Find a way to get it. * Elements transcript = doc.select("#transcript"); System.out.print("Transcript: \n" + transcript.text()); */ } catch (IOException exp) { exp.printStackTrace(); } }
/**
 * Parses a feed page and appends one {@code FeedItem} per {@code div.tie-box} found inside the
 * first {@code div.tie-wrapper} to {@code mFeedItems}, then notifies {@code mFeedItemAdapter}.
 *
 * <p>Any exception raised while parsing is printed and parsing stops; items added before the
 * failure are kept and the adapter is still notified.
 *
 * @param resource HTML source of the feed page
 */
private void parseFeedItem(String resource) {
  try {
    Document doc = Jsoup.parse(resource);
    Element masthead = doc.select("div.tie-wrapper").first();
    // A page without the wrapper simply has no items; don't rely on the catch-all for that.
    if (masthead != null) {
      for (Element feedPost : masthead.select("div.tie-box")) {
        FeedItem feedItem = new FeedItem();

        Element titleElement = feedPost.select("div.tie-header h2.tie-title a").first();
        Element nameElement =
            feedPost.select("div.tie-content div.tie-user div.user-info p span.user-name").first();
        Element sourceElement =
            feedPost.select("div.tie-content div.tie-user div.user-info p span.user-form").first();
        Element timestampElement =
            feedPost.select("div.tie-content div.tie-user div.user-info p.tie-date").first();
        Elements imageElement = feedPost.select("div.tie-content img.st-photo");
        Elements contentElements = feedPost.select("div.tie-content p:not(.tie-date):gt(0)");

        String title = titleElement.text();
        String name = nameElement.text();
        String source = sourceElement.text();
        String timestamp = timestampElement.text();

        // One paragraph per line, each terminated by a newline.
        StringBuilder content = new StringBuilder();
        for (Element paragraph : contentElements) {
          content.append(paragraph.text()).append("\n");
        }

        // Compare by value: the original used != on strings, which tests object identity.
        String imageSrc = imageElement.attr("src");
        String image = imageSrc.isEmpty() ? null : url + imageSrc;

        feedItem.setTitle(title);
        feedItem.setName(name);
        feedItem.setPostTime(timestamp);
        feedItem.setSource(source);
        feedItem.setImage(image);
        feedItem.setContent(content.toString());
        mFeedItems.add(feedItem);
      }
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
  mFeedItemAdapter.notifyDataSetChanged();
}
private void getDatafromJsoup(String url) { // TODO Auto-generated method stub try { Document doc = Jsoup.connect(url).get(); // Elements content = doc.getElementsByClass("cell item"); Elements header = doc.getElementsByClass("topic_content"); Log.e("topic_content", header.text()); title = header.text(); Elements content = doc.getElementsByTag("tbody"); for (Element link : content) { DetailEntity entity = new DetailEntity(); Elements avatar = link.getElementsByTag("img"); { String avaterLink = avatar.attr("src"); if (avaterLink.startsWith("//cdn.")) { entity.setAvater("http:" + avaterLink); } } Elements reply_content = link.getElementsByClass("reply_content"); Log.e("reply_content", reply_content.text()); entity.setReply_count(reply_content.text()); Elements title = link.getElementsByTag("a"); if (title.attr("href").startsWith("/member/")) { Log.e("title", title.text()); entity.setTitle(title.text()); } Log.e( "other", link.getElementsByClass("fade small").text() + link.getElementsByClass("small fade").text()); if (!TextUtils.isEmpty(reply_content.text())) entities.add(entity); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } }
private boolean updateDailyNews(Document doc, String dailyTitle, DailyNews dailyNews) throws JSONException { Elements viewMoreElements = doc.getElementsByClass("view-more"); if (viewMoreElements.size() > 1) { dailyNews.setMulti(true); Elements questionTitleElements = doc.getElementsByClass("question-title"); for (int j = 0; j < viewMoreElements.size(); j++) { if (questionTitleElements.get(j).text().length() == 0) { dailyNews.addQuestionTitle(dailyTitle); } else { dailyNews.addQuestionTitle(questionTitleElements.get(j).text()); } Elements viewQuestionElement = viewMoreElements.get(j).select("a"); if (viewQuestionElement.text().equals("查看知乎讨论")) { dailyNews.addQuestionUrl(viewQuestionElement.attr("href")); } else { return false; } } } else if (viewMoreElements.size() == 1) { dailyNews.setMulti(false); Elements viewQuestionElement = viewMoreElements.select("a"); if (viewQuestionElement.text().equals("查看知乎讨论")) { dailyNews.setQuestionUrl(viewQuestionElement.attr("href")); } else { return false; } // Question title is the same with daily title if (doc.getElementsByClass("question-title").text().length() == 0) { dailyNews.setQuestionTitle(dailyTitle); } else { dailyNews.setQuestionTitle(doc.getElementsByClass("question-title").text()); } } else { return false; } return true; }
/**
 * Fetches {@code url} on a worker thread, waits for it to finish, and returns one
 * {@code EntryModel} per {@code player_conte} element, titled by the anchor at the same index
 * under {@code video_option} (or {@code video_option_act} when the former is absent).
 *
 * <p>The link is the element's {@code src}; when that does not start with {@code http}, the
 * value after the first {@code =} of the first {@code flashvars} entry is used instead.
 * Fetch or index failures are printed and the entries collected so far are returned.
 *
 * @param url page to fetch
 * @return the collected entries (possibly empty), or {@code null} if the calling thread was
 *     interrupted while waiting
 */
public static ArrayList<EntryModel> getEpisodeLinks(final String url) {
  final ArrayList<EntryModel> result = new ArrayList<>();
  Thread thread =
      new Thread(
          () -> {
            try {
              Document document = Jsoup.connect(url).get();

              Element titleContainer = document.getElementsByClass("video_option").first();
              if (titleContainer == null) {
                titleContainer = document.getElementsByClass("video_option_act").first();
              }
              if (titleContainer == null) {
                // Neither container exists. The original dereferenced null here, and the
                // resulting NullPointerException was not caught on this thread.
                return;
              }
              Elements titleElements = titleContainer.getElementsByTag("a");

              Elements videoElements = document.getElementsByClass("player_conte");
              for (int i = 0; i < videoElements.size(); i++) {
                Element video = videoElements.get(i);
                String title = titleElements.get(i).text();
                String linkUrl = video.attr("src");
                if (!linkUrl.startsWith("http")) {
                  Elements flashElements = video.select("param[name=flashvars]");
                  linkUrl = flashElements.attr("value").split("[&;]")[0].split("=")[1];
                  if (!linkUrl.startsWith("http")) {
                    flashElements = video.select("[flashvars]");
                    linkUrl = flashElements.attr("flashvars").split("[&;]")[0].split("=")[1];
                  }
                }
                result.add(new EntryModel(Constants.TYPE_LINK, title, linkUrl));
              }
            } catch (IOException | IndexOutOfBoundsException e) {
              e.printStackTrace();
            }
          });
  thread.start();
  try {
    thread.join();
    return result;
  } catch (InterruptedException e) {
    // Restore the flag so callers further up can still observe the interruption.
    Thread.currentThread().interrupt();
    e.printStackTrace();
    return null;
  }
}
/**
 * Renders the search-form input template for a many-to-one property and checks that the
 * output is non-empty and contains a {@code select} with the expected {@code id} and
 * {@code ng-model} attributes.
 */
@Test
public void testGenerateManyToOneProperty() throws Exception {
  Map<String, Object> model =
      TestHelpers.createInspectionResultWrapper(ENTITY_NAME, MANY_TO_ONE_PROP);

  URL templateLocation =
      getClass().getResource(Deployments.BASE_PACKAGE_PATH + Deployments.SEARCH_FORM_INPUT);
  Resource<URL> template = resourceFactory.create(templateLocation);
  FreemarkerTemplate freemarkerTemplate = new FreemarkerTemplate(template);
  TemplateProcessor templateProcessor = processorFactory.fromTemplate(freemarkerTemplate);

  String rendered = templateProcessor.process(model);
  Document fragment = Jsoup.parseBodyFragment(rendered);
  assertThat(rendered.trim(), not(equalTo("")));

  Elements formGroup = fragment.select("div.form-group");
  assertThat(formGroup, notNullValue());

  Elements selectInput = formGroup.select("div.col-sm-10 > select");
  String actualId = selectInput.attr("id");
  assertThat(actualId, equalTo("customer"));
  String actualModel = selectInput.attr("ng-model");
  assertThat(actualModel, equalTo("search" + "." + "customer"));
}
public static String[] productsIdList_get(String url) throws IOException { String[] arrList = null; Document doc = doc_get(url); if (doc != null) { Elements link_span = doc.select( "body>div.searchwrap.w980>div#bodyRight>#search_result>#plist>#search_table>#productsIdList"); // System.out.println(link_span.attr("value")); arrList = link_span.attr("value").split(","); return arrList; } return arrList; }
protected void parseLoginStep2(SimpleObject context) { String text = ContextUtil.getContent(context); if (text == null) { return; } String phone1 = phoneNo; String password1 = password; String n = StringUtil.subStr("strEnc(username,", ");", text).trim(); if (!StringUtils.isBlank(n)) { String[] stra = n.trim().replaceAll("\'", "").split(","); // pwd, digit, f, s phone1 = executeJsFunc("des/tel_com_des.js", "strEnc", phoneNo, stra[0], stra[1], stra[2]); password1 = executeJsFunc("des/tel_com_des.js", "strEnc", password, stra[0], stra[1], stra[2]); } Document doc = ContextUtil.getDocumentOfContent(context); Elements form = doc.select("form#c2000004"); Request req = new Request(fixedFullUrl(form.attr("action"))); req.setMethod("POST"); req.initNameValuePairs(12); req.addNameValuePairs("lt", form.select("input[name=lt]").attr("value")); req.addNameValuePairs("_eventId", "submit"); req.addNameValuePairs("forbidpass", "null"); req.addNameValuePairs("areaname", areaName); req.addNameValuePairs("password", password1); req.addNameValuePairs("authtype", "c2000004"); req.addNameValuePairs("customFileld01", customField1); req.addNameValuePairs("customFileld02", customField2); req.addNameValuePairs("forbidaccounts", "null"); req.addNameValuePairs("open_no", "c2000004"); req.addNameValuePairs("username", phone1); req.addNameValuePairs("randomId", authCode == null ? "" : authCode); req.setCharset(UAM_CHAR_SET); req.addObjservers( new AbstractProcessorObserver(util, WaringConstaint.ZGDX_3) { @Override public void afterRequest(SimpleObject context) { parseLoginStep3(context); } }); spider.addRequest(req); }
@Override protected ArrayList<object> doInBackground(Integer... integers) { if (android.os.Build.VERSION.SDK_INT > 9) { StrictMode.ThreadPolicy policy = new StrictMode.ThreadPolicy.Builder().permitAll().build(); StrictMode.setThreadPolicy(policy); Document doc; mArrayList = new ArrayList<object>(); try { doc = Jsoup.connect(url).timeout(10 * 1000).get(); Element element = doc.body(); Elements elements = element.getElementsByClass("item"); Log.i(null, "size" + elements.size()); for (Element link : elements) { Elements els_links = link.select("a[href]"); Log.i(null, "links" + els_links.attr("href")); Elements els_linksw = link.select("img"); Element element1 = link.getElementsByClass("item-con").first(); Element element2 = element1.getElementsByTag("p").first(); Log.i(null, "img" + els_linksw.attr("src")); object _util = new object(); _util.setImg(els_linksw.attr("src")); // String str[]=; _util.setTxt(element2.text()); _util.setUrl(els_links.attr("href")); mArrayList.add(_util); } } catch (Exception e) { Log.i(null, "exception" + e); } finally { } } return mArrayList; }
/**
 * Extracts the {@code session_id} query parameter from the logout link of a page.
 *
 * @param document parsed page
 * @return the parameter value (up to the next {@code &}), or {@code null} when the page has no
 *     {@code logout.php?} link or the link carries no {@code session_id=} parameter
 */
public String parserSessionID(Document document) {
  final String key = "session_id=";

  Elements logout = document.select("a[href^=logout.php?]");
  if (logout.isEmpty()) {
    return null;
  }

  String href = logout.attr("href");
  int start = href.indexOf(key);
  if (start < 0) {
    // Without this check indexOf's -1 made the original return a slice of the URL itself.
    return null;
  }

  String sessionId = href.substring(start + key.length());
  int end = sessionId.indexOf('&');
  return end >= 0 ? sessionId.substring(0, end) : sessionId;
}
public static List<CaoImg> getImgData(String url) throws Exception { String response = HttpUtils.getString(url); Document parse = Jsoup.parse(response); Elements allElements = parse.getAllElements(); List<CaoImg> caoImgs = new ArrayList<CaoImg>(); for (int i = 0; i < allElements.size(); i++) { Element element = allElements.get(i); // <table class="wikitable" // style="width: 22em; position: absolute; top: 0px; left: 0px;"> String nodeName = element.nodeName(); String attrClass = element.attr("class"); if (nodeName.equals("table") && "wikitable".equals(attrClass + "")) { String title = element.getElementsByAttribute("title").get(0).attr("title"); Elements imgElement = element.getElementsByTag("img"); String src = imgElement.attr("src"); Elements styleElements = element.getElementsByAttributeValueContaining("style", "font-size"); String otherName = null; String intro = null; if (styleElements.size() == 1) { intro = styleElements.get(0).text(); } else { otherName = styleElements.get(0).text(); intro = styleElements.get(1).text(); } CaoImg caoImg = new CaoImg(); caoImg.setName(title); caoImg.setImg(src); caoImg.setOtherName(otherName); caoImg.setIntro(intro); caoImgs.add(caoImg); } } return caoImgs; }
/**
 * Fetches {@code link} and fills the {@code title}, {@code img_src}, {@code description_text}
 * and {@code DateAdded} fields from the page.
 *
 * <p>Each paragraph under {@code div.news-block-justify} is appended to
 * {@code description_text} followed by a blank line. Any exception is printed and ignored.
 *
 * @param params unused
 * @return always {@code null}
 */
@Override
protected Void doInBackground(Void... params) {
  try {
    Document page = Jsoup.connect(link).ignoreContentType(true).get();

    title = page.select("span.title2").text();
    img_src = page.select("img.news-record-thumbnail").attr("src");

    Elements paragraphs = page.select("div.news-block-justify").select("p");
    for (int i = 0; i < paragraphs.size(); i++) {
      description_text += paragraphs.get(i).text() + "\n\n";
    }

    DateAdded = page.select("span.title").text();
  } catch (Exception ex) {
    ex.printStackTrace();
  }
  return null;
}
/**
 * Reads the bracketed, comma-separated id list from the {@code data-products} attribute of
 * {@code #productList} and passes the element(s) selected by each id to
 * {@code buildResultList}.
 *
 * <p>A missing attribute, a value without {@code [}, an empty list, and blank ids are skipped
 * instead of raising an exception.
 *
 * @param elements search result markup; may be {@code null}
 * @return {@code productinfoList}
 */
@Override
public List<ParsedToken> collectSearchResult(Elements elements) {
  if (elements == null) {
    return this.productinfoList;
  }

  String products = elements.select("#productList").attr("data-products");
  int open = products.indexOf('[');
  if (open < 0) {
    // No list present; the original failed with ArrayIndexOutOfBoundsException here.
    return this.productinfoList;
  }
  int close = products.indexOf(']', open + 1);
  if (close < 0) {
    // Unterminated list: read to the end, as the split-based original did.
    close = products.length();
  }

  for (String id : products.substring(open + 1, close).split(",")) {
    String trimmedId = id.trim();
    if (trimmedId.isEmpty()) {
      // "#" alone is not a valid selector.
      continue;
    }
    Elements productInfo = elements.select("#".concat(trimmedId));
    buildResultList(productInfo);
  }
  return this.productinfoList;
}
@Override public HNFeed parseDocument(Document doc) throws Exception { if (doc == null) return new HNFeed(); ArrayList<HNPost> posts = new ArrayList<HNPost>(); // clumsy, but hopefully stable query - first element retrieved is the // top table, we have to skip that: Elements tableRows = doc.select("table tr table tr"); tableRows.remove(0); Elements nextPageURLElements = tableRows.select("a:matches(More)"); String nextPageURL = null; if (nextPageURLElements.size() > 0) nextPageURL = resolveRelativeHNURL(nextPageURLElements.attr("href")); String url = null; String title = null; String author = null; int commentsCount = 0; int points = 0; String urlDomain = null; String postID = null; boolean endParsing = false; for (int row = 0; row < tableRows.size(); row++) { int rowInPost = row % 3; Element rowElement = tableRows.get(row); switch (rowInPost) { case 0: Element e1 = rowElement.select("tr > td:eq(2) > a").first(); if (e1 == null) { endParsing = true; break; } title = e1.text(); url = resolveRelativeHNURL(e1.attr("href")); urlDomain = getDomainName(url); break; case 1: points = getIntValueFollowedBySuffix(rowElement.select("tr > td:eq(1) > span").text(), " p"); author = rowElement.select("tr > td:eq(1) > a[href*=user]").text(); Element e2 = rowElement.select("tr > td:eq(1) > a[href*=item]").first(); if (e2 != null) { commentsCount = getIntValueFollowedBySuffix(e2.text(), " c"); if (commentsCount == BaseHTMLParser.UNDEFINED && e2.text().contains("discuss")) commentsCount = 0; postID = getStringValuePrefixedByPrefix(e2.attr("href"), "id="); } else commentsCount = BaseHTMLParser.UNDEFINED; posts.add(new HNPost(url, title, urlDomain, author, postID, commentsCount, points)); break; default: break; } if (endParsing) break; } return new HNFeed(posts, nextPageURL); }
@SuppressWarnings("unchecked") protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { // 0.init String captchaURL = null; String captchaImage = null; BasicCookieStore cookieStore = new BasicCookieStore(); // CloseableHttpClient httpClient = HttpClients.createDefault(); CloseableHttpClient httpClient = HttpClients.custom().setDefaultCookieStore(cookieStore).build(); // 1.Send Get request header to server. // Get the response Html page. System.out.println("==========Send Request to e-can server=========="); HttpGet httpGet = new HttpGet("https://www.e-can.com.tw/reservationUNMember_online.aspx"); httpGet.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); httpGet.addHeader("Accept-Encoding", "gzip, deflate"); httpGet.addHeader("Accept-Language", "zh-TW,zh;q=0.8,en-US;q=0.5,en;q=0.3"); httpGet.addHeader("Connection", "Keep-Alive"); httpGet.addHeader("Host", "www.e-can.com.tw"); httpGet.addHeader("User-Agent", "Mozilla"); CloseableHttpResponse resp = httpClient.execute(httpGet); System.out.println(); // show server response status > GET 200 OK System.out.println(resp.getStatusLine()); for (Header h : resp.getAllHeaders()) { System.out.println(h); } System.out.println("**********End of Headers********** \n\n"); HttpEntity entity = resp.getEntity(); // show entity // System.out.println("entity="+entity); String html = EntityUtils.toString(resp.getEntity()); // show html page // System.out.println(html); // 2.Use Jsoup to parse html page. // Select cssString to get captchaURL and captchaKey source. 
Document htmlDoc = Jsoup.parse(html); Elements elementEventTarget = htmlDoc.select("#__EVENTTARGET"); Elements elementEventArgument = htmlDoc.select("#__EVENTARGUMENT"); Elements elementViewState = htmlDoc.select("#__VIEWSTATE"); Elements elementViewStateGenerrator = htmlDoc.select("#__VIEWSTATEGENERATOR"); Elements elementddlGetdate = htmlDoc.select("#ddlGetdate > option"); // info for post later String __EVENTTARGET = elementEventTarget.val(); String __EVENTARGUMENT = elementEventArgument.val(); String __VIEWSTATE = elementViewState.val(); String __VIEWSTATEGENERATOR = elementViewStateGenerrator.val(); int count = 0; JSONObject joOption = new JSONObject(); for (Element e : elementddlGetdate) { joOption.put(count++, e.toString()); } System.out.println("joOption = " + joOption); Elements elementCaptcha = htmlDoc.select("#captcha"); System.out.println(elementCaptcha.attr("src")); captchaURL = "https://www.e-can.com.tw/" + elementCaptcha.attr("src"); // show URL System.out.println("captchaURL=" + captchaURL); // 3.Send GET request to get the captchaImage source. // Encode source to base64 String. 
System.out.println("==========Send request to e-can for captcha image=========="); httpGet = new HttpGet(captchaURL); httpGet.addHeader("Referer", "https://www.e-can.com.tw/reservationUNMember_online.aspx"); httpGet.addHeader("Accept", "image/png,image/*;q=0.8,*/*;q=0.5"); httpGet.addHeader("Accept-Encoding", "gzip, deflate"); httpGet.addHeader("Accept-Language", "zh-TW,zh;q=0.8,en-US;q=0.5,en;q=0.3"); httpGet.addHeader("Connection", "Keep-Alive"); httpGet.addHeader("Host", "www.e-can.com.tw"); httpGet.addHeader("User-Agent", "Mozilla"); resp = httpClient.execute(httpGet); System.out.println(); List<Cookie> cookies = cookieStore.getCookies(); if (cookies.isEmpty()) { System.out.println("None"); } else { for (int i = 0; i < cookies.size(); i++) { System.out.println("- " + cookies.get(i).toString()); } } System.out.println("cookieName= " + cookies.get(0).getName()); System.out.println("cookieValue= " + cookies.get(0).getValue()); System.out.println(); System.out.println(resp.getStatusLine()); for (Header h : resp.getAllHeaders()) { System.out.println(h); } System.out.println("**********End of Headers********** \n\n"); entity = resp.getEntity(); InputStream instream = entity.getContent(); ByteArrayOutputStream baos = new ByteArrayOutputStream(); byte[] bytes = new byte[instream.available()]; int reads = instream.read(); while (reads != -1) { baos.write(reads); reads = instream.read(); } bytes = baos.toByteArray(); captchaImage = "data:image/png;base64," + new BASE64Encoder().encode(bytes); // show captchaImage of base64 code // System.out.println(captchaImage); EntityUtils.consume(entity); // 4.Use Json format to wrap url, key and source string. 
// Then send response to ajax request from index.jsp JSONObject jo = new JSONObject(); jo.put("__EVENTTARGET", __EVENTTARGET); jo.put("__EVENTARGUMENT", __EVENTARGUMENT); jo.put("__VIEWSTATE", __VIEWSTATE); jo.put("__VIEWSTATEGENERATOR", __VIEWSTATEGENERATOR); jo.put("captchaURL", captchaURL); jo.put("captchaImage", captchaImage); jo.put(cookies.get(0).getName(), cookies.get(0).getValue()); String bothJson = "[" + jo + "," + joOption + "]"; response.setContentType("application/json"); response.setCharacterEncoding("utf-8"); PrintWriter out = response.getWriter(); out.print(bothJson); out.flush(); }
@Override public SNComments parseDocument(Document doc) throws Exception { SNComments comments = new SNComments(); if (doc == null) { return comments; } Elements tableRows = doc.body().select("table tr table tr"); if (tableRows != null && tableRows.size() > 0) { tableRows.remove(0); // 获取下一页链接 Elements moreURLElements = tableRows.select("a:matches(More)"); String moreURL = null; if (moreURLElements.size() > 0) { moreURL = resolveRelativeSNURL(moreURLElements.attr("href")); } comments.setMoreURL(moreURL); String linkURL = null; String parentURL = null; String discussURL = null; String text = null; String created = null; SNUser user = null; String artistTitle = null; // 文章标题 String voteURL = null; for (int row = 0; row < tableRows.size(); row++) { int rowInPost = row % 2; Element rowElement = tableRows.get(row); if (rowInPost == 0) { Element textElement = rowElement.select("tr > td:eq(1) > span").first(); if (textElement == null) { break; } text = textElement.text(); user = new SNUser(); Element spanElement = rowElement.select("tr > td:eq(1) > div > span").first(); created = getCreateAt(spanElement.text()); Elements aElements = spanElement.select("span > a"); if (aElements != null && aElements.size() >= 4) { int size = aElements.size(); Element anthorURLElement = aElements.first(); user.setId(anthorURLElement.text()); Element linkURLElement = aElements.get(1); linkURL = resolveRelativeSNURL(linkURLElement.attr("href")); Element parentURLElement = aElements.get(2); parentURL = resolveRelativeSNURL(parentURLElement.attr("href")); Element artistAElement = aElements.last(); discussURL = resolveRelativeSNURL(artistAElement.attr("href")); artistTitle = artistAElement.text(); if (size == 6) { // TODO edit delete } } Element voteAElement = rowElement.select("tr > td:eq(0) a").first(); if (voteAElement != null) { // 登录用户的评论没有url voteURL = resolveRelativeSNURL(voteAElement.attr("href")); } comments.addComment( new SNComment( linkURL, parentURL, discussURL, text, created, user, 
artistTitle, voteURL, null)); } } } return comments; }
/**
 * Downloads one painting.
 *
 * <p>Fetches {@code paintingLink}, reads the image URL from the {@code href} of the
 * {@code download} element, extracts title, artist, date and culture from the
 * {@code tombstone-container} text, prints them, and copies the image to
 * {@code folderPath + artist + "-" + title + ".jpg"} (both names reduced to letters and
 * digits).
 *
 * <p>The method returns without downloading when the page cannot be fetched, when the image
 * URL contains whitespace, or when it is empty.
 *
 * @param paintingLink URL of the painting's page
 * @throws IOException if the image URL is malformed or copying the image fails
 */
public void paintingScraper(String paintingLink) throws IOException {
  Document doc;
  try {
    doc = Jsoup.connect(paintingLink).get();
  } catch (IOException e) {
    // Nothing to scrape without a page; the original went on to dereference a null document.
    e.printStackTrace();
    return;
  }

  Elements imgContainer = doc.getElementsByClass("download");
  String imgURL = imgContainer.attr("href");
  for (int i = 0; i < imgURL.length(); i++) {
    if (Character.isWhitespace(imgURL.charAt(i))) {
      return;
    }
  }

  /*
   * The following parses the single image for
   * Title
   * Artist
   * Date
   * Culture
   * To be added to the image file properties
   */
  Elements imageInfo = doc.getElementsByClass("tombstone-container");
  String allInfo = imageInfo.select(".tombstone").text();
  String artist = null;
  int endArtist = 0;
  String date = null;
  int startDate = 0;
  int endDate = 0;
  String culture = null;
  int startCulture = 0;
  int endCulture = 0;

  /*
   * Parses the info for the Title
   */
  String title = imageInfo.select("h2").text();
  title = title.replaceAll("[^A-Za-z0-9 ]", "").trim();

  /*
   * Parses the info for the Artist
   * NOTE(review): the scanning loops below are kept exactly as they were; the offsets assume
   * a fixed label layout in the tombstone text, which should be confirmed against real pages.
   */
  for (int i = 9; i < allInfo.length(); i++) {
    if (!allInfo.contains("Artist:")) {
      endArtist = 0;
      break;
    }
    if (allInfo.substring(8, i).contains(": ")) {
      endArtist = i - 2;
      artist = allInfo.substring(8, endArtist).trim();
      if (artist.contains("(")) {
        for (int h = 0; h < artist.length(); h++) {
          if (artist.substring(0, h).contains("(")) {
            endArtist = h - 1;
            artist = artist.substring(0, h - 1).trim();
            break;
          }
        }
      } else {
        break;
      }
    }
  }

  /*
   * parses the info for the Date
   */
  for (int j = endArtist; j < allInfo.length(); j++) {
    if (allInfo.substring(endArtist, j).contains("Date: ")) {
      startDate = j;
      for (int k = startDate; k < allInfo.length(); k++) {
        if (allInfo.substring(startDate, k).contains("Culture: ")) {
          endDate = k - 10;
          date = allInfo.substring(startDate, endDate);
          break;
        }
        if (allInfo.substring(startDate, k).contains("Medium: ")) {
          endDate = k - 9;
          date = allInfo.substring(startDate, endDate);
          break;
        }
        if (allInfo.substring(startDate, k).contains("Dimensions: ")) {
          endDate = k - 13;
          date = allInfo.substring(startDate, endDate);
          break;
        }
      }
      break;
    }
  }

  /*
   * Parses the info for the culture
   */
  for (int l = endDate; l < allInfo.length(); l++) {
    if (allInfo.substring(endDate, l).contains("Culture: ")) {
      startCulture = l;
      for (int m = startCulture; m < allInfo.length(); m++) {
        if (allInfo.substring(startCulture, m).contains("Medium")) {
          endCulture = m - 6;
          culture = allInfo.substring(startCulture, endCulture).trim();
          break;
        }
        if (allInfo.substring(startCulture, m).contains("Geography")) {
          endCulture = m - 10;
          culture = allInfo.substring(startCulture, endCulture).trim();
          break;
        }
      }
      break;
    }
  }

  // The original also tested imgURL.equals(null), which is always false for a non-null string.
  if (imgURL.isEmpty()) {
    System.out.println("No info!");
    System.out.println("-----------------------------------------------");
    return;
  }
  System.out.println("artist: " + artist);
  System.out.println("title: " + title);
  System.out.println("date: " + date);
  System.out.println("culture: " + culture);
  System.out.println("url: " + imgURL);

  if (artist == null) {
    artist = "none";
    System.out.println("THIS SHOULD PRINT");
  }
  if (title == null) {
    title = "none";
  }
  String artistFile = artist.replaceAll("[^A-Za-z0-9]", "");
  System.out.println("artistfIle: " + artistFile);
  String titleFile = title.replaceAll("[^A-Za-z0-9]", "");
  System.out.println("titlefile: " + titleFile);

  /*
   * The following copies the file into my directory
   */
  String destinationFile = folderPath + artistFile + "-" + titleFile + ".jpg";
  System.out.println(destinationFile);
  System.out.println("-----------------------------------------------");

  URL url = new URL(imgURL);
  // try-with-resources closes both streams even when the copy fails midway.
  try (InputStream in = url.openStream();
      OutputStream out = new FileOutputStream(destinationFile)) {
    byte[] b = new byte[2048];
    int length;
    while ((length = in.read(b)) != -1) {
      out.write(b, 0, length);
    }
  }
}
private void setListElements( Element ele, Node rightListNode, Session session, String locale, Map<String, String> urlMap) { try { String ownPdfText = ""; String pdfIcon = ""; String pdfSize = ""; Elements h2Ele = ele.getElementsByTag("h2"); Elements h3Ele = ele.getElementsByTag("h3"); Elements ulEle = ele.getElementsByTag("ul"); String h2Text = null; String h3Text = null; // start of handling title of list component if (!h2Ele.isEmpty()) { h2Text = h2Ele.first().text(); rightListNode.setProperty("title", h2Text); if (h2Ele.size() > 1) { sb.append(Constants.MISMATCH_IN_RIGHT_LIST_COUNT); } } else { sb.append(Constants.LIST_HEADING_COMPONENT_NOT_FOUND); } // end of handling title of list component // start of handling title of list component NodeIterator h3Iterator = rightListNode.hasNode("element_subtitle_0") ? rightListNode.getNodes("element_subtitle*") : null; if (h3Iterator != null) { if (!h3Ele.isEmpty()) { int eleSize = h3Ele.size(); int nodeSize = (int) h3Iterator.getSize(); Node h3nodeList; if (eleSize == nodeSize) { for (Element h3Itr : h3Ele) { h3nodeList = (Node) h3Iterator.next(); h3Text = h3Itr.text(); h3nodeList.setProperty("subtitle", h3Text); } } if (nodeSize < eleSize) { for (Element h3Itr : h3Ele) { if (h3Iterator.hasNext()) { h3nodeList = (Node) h3Iterator.next(); h3Text = h3Itr.text(); h3nodeList.setProperty("subtitle", h3Text); } } sb.append( Constants.MISMATCH_IN_LIST_ELEMENT + nodeSize + Constants.SPOTLIGHT_ELEMENT_COUNT + eleSize); } if (nodeSize > eleSize) { for (Element h3Itr : h3Ele) { h3nodeList = (Node) h3Iterator.next(); h3Text = h3Itr.text(); h3nodeList.setProperty("subtitle", h3Text); } sb.append( Constants.LIST_ELEMENTS_COUNT_MISMATCH + nodeSize + Constants.SPOTLIGHT_ELEMENT_COUNT + eleSize); } } else { sb.append(Constants.LIST_HEADING_COMPONENT_NOT_FOUND); log.debug("h3 text is not avalable"); } } else { if (!h3Ele.isEmpty()) { log.debug("subtitle node doesnot exist but ele exist"); sb.append(Constants.LIST_COMPONENT_NOT_FOUND); } } 
// end of handling title of list component // Element List NodeIterator ulNodeIterator = rightListNode.hasNode("element_list_0") ? rightListNode.getNodes("element_list*") : null; if (ulNodeIterator != null) { Node ulnodeList; for (Element ulItr : ulEle) { if (ulNodeIterator.hasNext()) { ulnodeList = (Node) ulNodeIterator.next(); Elements list = ulItr.getElementsByTag("li"); List<String> listAdd = new ArrayList<String>(); for (Element li : list) { pdfIcon = ""; pdfSize = ""; boolean openNewWindow = false; // pdf content try { ownPdfText = li.ownText(); if (StringUtils.isNotEmpty(ownPdfText)) { log.debug("OWn text is:" + ownPdfText); if (ownPdfText.toLowerCase().contains("pdf") || ownPdfText.toLowerCase().contains("video")) { pdfIcon = "pdf"; if (ownPdfText.toLowerCase().contains("video")) { pdfIcon = "video"; } int i = 0; for (; i < ownPdfText.length(); i++) { char character = ownPdfText.charAt(i); boolean isDigit = Character.isDigit(character); if (isDigit) { break; } } pdfSize = ownPdfText.substring(i, ownPdfText.length() - 1); pdfSize = pdfSize.replace(")", ""); pdfSize = pdfSize.trim(); } } } catch (Exception e) { sb.append(Constants.Exception_BY_SPECIAL_CHARACTER); log.error("Exception : ", e); } // fix for new win icon Elements newwinCheck = li.select("span.newwin"); if (!newwinCheck.isEmpty()) { log.debug("extra new win icon found"); sb.append(Constants.EXTRA_ICON_FOUND_IN_LIST); } // check for the lock icon Elements imgInList = li.getElementsByTag("img"); if (!imgInList.isEmpty()) { String altImg = imgInList.attr("alt"); if (altImg.equals("lock_icon")) { log.debug("lock icon found in the list"); sb.append(Constants.EXTRA_LOCK_IMG_FOUND_IN_LIST); } } if (!li.getElementsByTag("a").isEmpty()) { Element a = li.getElementsByTag("a").first(); String aHref = a.absUrl("href"); if (StringUtil.isBlank(aHref)) { aHref = a.attr("href"); } // Start extracting valid href log.debug("Before anchorHref" + a.absUrl("href") + "\n"); String anchorHref = 
FrameworkUtils.getLocaleReference(aHref, urlMap, locale, sb); log.debug("after anchorHref" + anchorHref + "\n"); // End extracting valid href JSONObject obj = new JSONObject(); obj.put("linktext", a.text()); obj.put("linkurl", anchorHref); obj.put("icon", pdfIcon); obj.put("size", pdfSize); obj.put("description", ""); obj.put("openInNewWindow", openNewWindow); listAdd.add(obj.toString()); } } ulnodeList.setProperty("listitems", listAdd.toArray(new String[listAdd.size()])); } } if (ulNodeIterator.hasNext()) { sb.append(Constants.MISMATCH_IN_RIGHT_LIST_COUNT); } } else { sb.append(Constants.NO_LIST_NODES_FOUND); } // End of Element List } catch (Exception e) { sb.append(Constants.UNABLE_TO_MIGRATE_LIST_COMPONENT); log.error("Exception : ", e); } }
public int crawBBWC(int ok, int fail, String url) { try { Document doc = Jsoup.connect(url).userAgent(UA).timeout(3000).get(); Elements frame = doc.select("iframe#verticalContent"); if (frame.size() > 0) { url = frame.attr("src"); } doc = Jsoup.connect(url).userAgent(UA).timeout(3000).get(); resultTitle = resultCont = ""; // 处理图片链接 Pattern p = Pattern.compile("issue_\\d+/articles/\\d+"); Matcher m = p.matcher(url); if (m.find()) { String pre = "http://s4.cdn.bb.bbwc.cn/" + m.group(); Elements imgs = doc.select("img"); if (imgs.size() > 0) { for (Element img : imgs) { String raw = img.attr("data-src"); raw = raw.replace("uploadfile", pre); img.attr("src", raw); } } } // 开始提取 Elements eletitle = doc.select(this.title_rex), eleauth = null, elecont = doc.select(this.cont_rex), eleextra = null; if (Constant.DEBUG) FileUtils.writeFile(doc.html(), "clip"); if (!auth_rex.equals("")) eleauth = doc.select(this.auth_rex); if (!extra_rex.equals("")) eleextra = doc.select(this.extra_rex); if (eletitle.size() > 0) { resultTitle = eletitle.get(0).html(); if (elecont.size() > 0) { elecont = addStyleForTable(elecont); resultCont = elecont.get(0).html(); } if (!auth_rex.equals("")) { if (eleauth.size() > 0) resultCont = "<p>" + eleauth.get(0).html() + "</p>" + resultCont; } if (!extra_rex.equals("")) { eleextra = addStyleForTable(eleextra); if (eleextra.size() > 0) resultCont = resultCont + eleextra.get(0).html(); } return ok; } else { MLog.e("", "没有匹配到title"); return fail; } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return fail; }
/**
 * Returns the {@code value} attribute of the {@code formhash} input in the given HTML.
 *
 * @param html page source to parse
 * @return the attribute value as reported by jsoup for the {@code input[name=formhash]}
 *     selection
 */
public static String formHash(String html) {
  return Jsoup.parse(html).select("input[name=formhash]").attr("value");
}