@Test public void parentlessToString() { Document doc = Jsoup.parse("<img src='foo'>"); Element img = doc.select("img").first(); assertEquals("<img src=\"foo\">", img.toString()); img.remove(); // lost its parent assertEquals("<img src=\"foo\">", img.toString()); }
/**
 * Looks up a YouTube link for the given movie by scraping the page at
 * {@code http://www.youtube.com/<formatted movie name>}.
 *
 * <p>Movies whose year is before 1990 are not looked up; for those the literal string
 * {@code "null"} is returned (kept for backward compatibility with existing callers).
 *
 * @param movie the movie whose year and name drive the lookup
 * @return {@code "null"} for pre-1990 movies; otherwise {@code "http://www.youtube.com"} followed
 *     by the relative link of the first result, or just {@code "http://www.youtube.com"} when the
 *     page could not be fetched or the expected markers were not found
 */
private static String getTrailer(Movie movie) {
  if (Integer.valueOf(movie.getMovieYear()) < 1990) {
    return "null";
  }

  final String youtubeBase = "http://www.youtube.com";
  final String titleMarker = "class=\"yt-lockup-title \"><a href=\"";
  final String hrefMarker = "<a href=\"";
  final String sessionMarker = "class=\"yt-uix-sessionlink";
  // Only this many characters after the title marker are inspected for the link.
  final int windowLength = 100;

  String link = formatYoutubeString(movie.getMovieName());
  try {
    String html = Jsoup.connect("http://www.youtube.com/" + link).get().body().toString();

    int titleStart = html.indexOf(titleMarker);
    if (titleStart < 0) {
      return youtubeBase;
    }
    // Clamp the window so a short page does not raise an out-of-bounds error.
    String linkDiv = html.substring(titleStart, Math.min(titleStart + windowLength, html.length()));

    // titleMarker itself ends with hrefMarker, so hrefMarker is always present in linkDiv.
    int hrefStart = linkDiv.indexOf(hrefMarker) + hrefMarker.length();
    // The href value is followed by a closing quote and a space before the session class.
    int hrefEnd = linkDiv.indexOf(sessionMarker) - 2;
    if (hrefEnd > hrefStart) {
      return youtubeBase + linkDiv.substring(hrefStart, hrefEnd);
    }
  } catch (Exception e) {
    System.out.println(e.toString());
  }
  return youtubeBase;
}
/** * getMovieActors parses through the movie's page html and returns three actors. * * @author defq0n * @param pageLink is the extended imdb url for the movie page. * @return movieActors String containing three actors. */ private static String[] getMovieActors(String pageLink) { String[] movieActors = {"", "", ""}; try { Document d = Jsoup.connect("http://imdb.com" + pageLink).get(); Element e = d.body(); String html = e.toString(); String actorsDiv = ""; for (int i = html.indexOf("<h4 class=\"inline\">Stars:</h4>") + 30; i < html.indexOf("See full cast and crew"); i++) { actorsDiv += html.charAt(i); } String tempDiv = actorsDiv; for (int i = 0; i < 3; i++) { // we will get the first three top actors String actor = ""; String t = "itemprop=\"url\"><span class=\"itemprop\" itemprop=\"name\">"; for (int j = tempDiv.indexOf(t) + t.length(); j < tempDiv.indexOf("</span></a>"); j++) { actor += tempDiv.charAt(j); } movieActors[i] = actor; tempDiv = ""; for (int j = actorsDiv.indexOf(actor + "</span>") + actor.length() + 7; j < actorsDiv.length(); j++) { tempDiv += actorsDiv.charAt(j); } } } catch (Exception e) { System.out.println(e.toString()); } return movieActors; }
public static void readHead() { String url = "http://www.2177s.com"; try { Document doc = Jsoup.connect(url).timeout(10000).get(); String title = doc.title(); System.out.printf("title:%s\n", title); // Elements eles = doc.select("meta[name~=(?i)keywords|(?i)description]"); Elements eles = doc.select("meta"); System.out.println(eles.size()); for (Element ele : eles) { if (StringUtils.containsIgnoreCase(url, title)) ; if (ele.toString().matches(".*(?i)keywords.*")) { System.out.println(ele.attr("content")); } // System.out.println(ele.attr("content")); } // Elements eles = doc.getElementsByTag("meta"); // for (Element ele : eles) { // System.out.printf("keys:%s\n", ele.attr("keywords")); // System.out.printf("desc:%s\n", ele.attr("description")); // System.out.println("----------------"); // } doc = null; } catch (Exception e) { e.printStackTrace(); } }
/**
 * Loads the given page and resolves the first anchor pointing at {@code gallery.php} into a
 * gallery URL on imagearn.com.
 *
 * @param url page to load and scan for a gallery link
 * @return the gallery URL built from the first matching anchor
 * @throws IOException if the page cannot be loaded or no gallery link is present
 */
private URL getGalleryFromImage(URL url) throws IOException {
  Document page = Http.url(url).get();
  for (Element anchor : page.select("a[href~=^gallery\\.php.*$]")) {
    logger.info("LINK: " + anchor.toString());
    if (!anchor.hasAttr("href")) {
      continue;
    }
    String href = anchor.attr("href");
    if (!href.contains("gallery.php")) {
      continue;
    }
    url = new URL("http://imagearn.com/" + href);
    logger.info("[!] Found gallery from given link: " + url);
    return url;
  }
  throw new IOException("Failed to find gallery at URL " + url);
}
public void download(Connection aInConnection, Collection<Image> images) throws IOException { aInConnection.url(url); Document lDocument = aInConnection.get(); Element lMain = lDocument.getElementById("main"); Elements lContents = lMain.getElementsByClass("content"); if (lContents.size() == 1) { StringBuilder sb = new StringBuilder(); Element lContent = lContents.first(); collectImages(lContent, images); Elements lLightboxElements = lContent.getElementsByClass("lightbox"); for (Element lLightboxElement : lLightboxElements) { Collection<Node> lImageNodes = extractImageNodes(lLightboxElement); Element lParent = lLightboxElement.parent(); int i = lLightboxElement.siblingIndex(); lParent.insertChildren(i, lImageNodes); lLightboxElement.remove(); } Elements lChildElements = lContent.children(); for (Element lChildElement : lChildElements) { if (lChildElement.hasClass("clear")) { // no more post content break; } if (title == null && lChildElement.tagName().equals("h1")) { // the first h1 header is the title title = lChildElement.html(); } else { if (excerpt == null && lChildElement.tagName().equals("p")) { excerpt = lChildElement.text(); } String lStr = lChildElement.toString(); sb.append(lStr); } } content = sb.toString(); Elements lDateElements = lContent.getElementsByClass("date"); String lHunDate = lDateElements.first().html(); date = new PostDate(lHunDate); } else { System.out.println("More than one content in main section of post page " + toString()); } }
/**
 * getPosterLink fetches the movie's IMDb page and extracts the poster image url from the image
 * section of the page body.
 *
 * @author defq0n
 * @param pageLink is the extended imdb url for the movie page.
 * @return posterLink String containing the poster url (ending in {@code .jpg}), or an empty string
 *     when the page could not be fetched or the expected markers were not found.
 */
private static String getPosterLink(String pageLink) {
  final String imageOpen = "class=\"image\">";
  final String sectionClose = "<div class=\"pro-title-link text-center\">";
  final String srcOpen = "src=\"";
  final String jpgClose = ".jpg\"";

  try {
    String html = Jsoup.connect("http://imdb.com" + pageLink).get().body().toString();

    int imageAt = html.indexOf(imageOpen);
    int sectionEnd = imageAt < 0 ? -1 : html.indexOf(sectionClose, imageAt);
    if (sectionEnd < 0) {
      return "";
    }
    String posterDiv = html.substring(imageAt + imageOpen.length(), sectionEnd);

    int srcAt = posterDiv.indexOf(srcOpen);
    int jpgAt = srcAt < 0 ? -1 : posterDiv.indexOf(jpgClose, srcAt + srcOpen.length());
    if (jpgAt < 0) {
      return "";
    }
    // Keep the ".jpg" extension but drop the closing quote that follows it.
    return posterDiv.substring(srcAt + srcOpen.length(), jpgAt + jpgClose.length() - 1);
  } catch (Exception e) {
    System.out.println(e.toString());
  }
  return "";
}
/**
 * getMovieDescription fetches the movie's IMDb page and extracts the description paragraph from
 * the page body.
 *
 * @author defq0n
 * @param pageLink is the extended imdb url for the movie page.
 * @return movieDescription String containing the description, or an empty string when the page
 *     could not be fetched or the expected markers were not found.
 */
private static String getMovieDescription(String pageLink) {
  final String descriptionOpen = "description\">";
  final String directorBlock = "<div class=\"txt-block\" itemprop=\"director\"";
  final String paragraphClose = "</p>";

  try {
    String html = Jsoup.connect("http://imdb.com" + pageLink).get().body().toString();

    int descriptionAt = html.indexOf(descriptionOpen);
    int blockEnd = descriptionAt < 0 ? -1 : html.indexOf(directorBlock, descriptionAt);
    if (blockEnd < 0) {
      return "";
    }
    String descriptionDiv = html.substring(descriptionAt + descriptionOpen.length(), blockEnd);

    // The description is everything up to the end of the first paragraph.
    int paragraphEnd = descriptionDiv.indexOf(paragraphClose);
    return paragraphEnd < 0 ? "" : descriptionDiv.substring(0, paragraphEnd);
  } catch (Exception e) {
    System.out.println(e.toString());
  }
  return "";
}
@Override void parseUrl(String url) throws IOException { doc = Jsoup.connect(url).timeout(0).get(); Elements elements = doc.select("table") .select("tbody") .select("tr") .select("td") .select("table") .select("tbody") .select("tr") .select("td") .select("a[href]"); for (Element e : elements) { if (!e.toString().contains("cart")) myProdLinks.add(e.attr("abs:href")); } // System.out.println(myProdLinks); // System.out.println(myProdLinks.size()); }
/** * parseMoviesHTMl(String) takes a formatted title query and returns a string array of movie HTML * source code. Maximum of 10 movies. //TODO Doesn't always work but works for most cases, HTML * tends to change for some results, will be changed when the reason is found out. * * @author defq0n * @param titleQuery Formatted title query. * @return moviesHTMl String array of each HTML source. */ public static String[] parseMoviesHTML(String titleQuery) { String moviesHTML[] = { "", "", "", "", "", "", "", "", "", "" }; // List of movies in HTML, limied to 10, initalized for easy string addition try { // Get the document using JSoup Document d = Jsoup.connect("http://www.imdb.com/find?ref_=nv_sr_fn&q=" + titleQuery + "&s=all").get(); // Get the HTML body element Element e = d.body(); // Declare Variables String xhtml = e.toString(); // HTML of body element String tbody = ""; // HTML of tbody element // get tbody html and store in #tbody for (int i = xhtml.indexOf("<tbody>"); i < xhtml.indexOf("</tbody>"); i++) { tbody += xhtml.charAt(i); } // loop over tbody to find how many movies there are and store the results in #moviesHTML int counter = 0; // counter for while loop while (counter < 10) { // hard code 10 because thats the maximum amount in #moviesHTML for (int i = tbody.indexOf("<td class=\"result_text\">") + 24; i < tbody.indexOf(") </td>") + 7; i++) { moviesHTML[counter] += tbody.charAt(i); } // now we have to reset tbody for the next String temp_tbody = ""; for (int i = tbody.indexOf(") </td>") + 7; i < tbody.length(); i++) { temp_tbody += tbody.charAt(i); } // set tbody to the temporary one to get rid of the previous result tbody = temp_tbody; // index counter for next movie counter++; } } catch (Exception e) { System.out.println(e.getStackTrace()[1]); } return moviesHTML; }
/**
 * getGenre fetches the movie's IMDb page and extracts the first genre from the page body.
 *
 * @param pageLink is the extended imdb url for the movie page.
 * @return the text between the first {@code itemprop="genre"} tag and its closing
 *     {@code </span>}, or an empty string when the page could not be fetched or the expected
 *     markers were not found.
 */
private static String getGenre(String pageLink) {
  final String genreAttr = "itemprop=\"genre\"";
  final String descriptionAttr = "itemprop=\"description\"";
  final String spanClose = "</span>";

  try {
    String html = Jsoup.connect("http://imdb.com" + pageLink).get().body().toString();

    int genreAt = html.indexOf(genreAttr);
    int descriptionAt = genreAt < 0 ? -1 : html.indexOf(descriptionAttr, genreAt);
    if (descriptionAt < 0) {
      return "";
    }
    String genreDiv = html.substring(genreAt, descriptionAt);

    // genreDiv starts with the attribute itself; skip it plus the '>' that closes the tag.
    int valueStart = genreAttr.length() + 1;
    int valueEnd = genreDiv.indexOf(spanClose);
    if (valueEnd < valueStart) {
      return "";
    }
    return genreDiv.substring(valueStart, valueEnd);
  } catch (Exception e) {
    System.out.println(e.toString());
  }
  return "";
}
@SuppressWarnings("unchecked") protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { // 0.init String captchaURL = null; String captchaImage = null; BasicCookieStore cookieStore = new BasicCookieStore(); // CloseableHttpClient httpClient = HttpClients.createDefault(); CloseableHttpClient httpClient = HttpClients.custom().setDefaultCookieStore(cookieStore).build(); // 1.Send Get request header to server. // Get the response Html page. System.out.println("==========Send Request to e-can server=========="); HttpGet httpGet = new HttpGet("https://www.e-can.com.tw/reservationUNMember_online.aspx"); httpGet.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); httpGet.addHeader("Accept-Encoding", "gzip, deflate"); httpGet.addHeader("Accept-Language", "zh-TW,zh;q=0.8,en-US;q=0.5,en;q=0.3"); httpGet.addHeader("Connection", "Keep-Alive"); httpGet.addHeader("Host", "www.e-can.com.tw"); httpGet.addHeader("User-Agent", "Mozilla"); CloseableHttpResponse resp = httpClient.execute(httpGet); System.out.println(); // show server response status > GET 200 OK System.out.println(resp.getStatusLine()); for (Header h : resp.getAllHeaders()) { System.out.println(h); } System.out.println("**********End of Headers********** \n\n"); HttpEntity entity = resp.getEntity(); // show entity // System.out.println("entity="+entity); String html = EntityUtils.toString(resp.getEntity()); // show html page // System.out.println(html); // 2.Use Jsoup to parse html page. // Select cssString to get captchaURL and captchaKey source. 
Document htmlDoc = Jsoup.parse(html); Elements elementEventTarget = htmlDoc.select("#__EVENTTARGET"); Elements elementEventArgument = htmlDoc.select("#__EVENTARGUMENT"); Elements elementViewState = htmlDoc.select("#__VIEWSTATE"); Elements elementViewStateGenerrator = htmlDoc.select("#__VIEWSTATEGENERATOR"); Elements elementddlGetdate = htmlDoc.select("#ddlGetdate > option"); // info for post later String __EVENTTARGET = elementEventTarget.val(); String __EVENTARGUMENT = elementEventArgument.val(); String __VIEWSTATE = elementViewState.val(); String __VIEWSTATEGENERATOR = elementViewStateGenerrator.val(); int count = 0; JSONObject joOption = new JSONObject(); for (Element e : elementddlGetdate) { joOption.put(count++, e.toString()); } System.out.println("joOption = " + joOption); Elements elementCaptcha = htmlDoc.select("#captcha"); System.out.println(elementCaptcha.attr("src")); captchaURL = "https://www.e-can.com.tw/" + elementCaptcha.attr("src"); // show URL System.out.println("captchaURL=" + captchaURL); // 3.Send GET request to get the captchaImage source. // Encode source to base64 String. 
System.out.println("==========Send request to e-can for captcha image=========="); httpGet = new HttpGet(captchaURL); httpGet.addHeader("Referer", "https://www.e-can.com.tw/reservationUNMember_online.aspx"); httpGet.addHeader("Accept", "image/png,image/*;q=0.8,*/*;q=0.5"); httpGet.addHeader("Accept-Encoding", "gzip, deflate"); httpGet.addHeader("Accept-Language", "zh-TW,zh;q=0.8,en-US;q=0.5,en;q=0.3"); httpGet.addHeader("Connection", "Keep-Alive"); httpGet.addHeader("Host", "www.e-can.com.tw"); httpGet.addHeader("User-Agent", "Mozilla"); resp = httpClient.execute(httpGet); System.out.println(); List<Cookie> cookies = cookieStore.getCookies(); if (cookies.isEmpty()) { System.out.println("None"); } else { for (int i = 0; i < cookies.size(); i++) { System.out.println("- " + cookies.get(i).toString()); } } System.out.println("cookieName= " + cookies.get(0).getName()); System.out.println("cookieValue= " + cookies.get(0).getValue()); System.out.println(); System.out.println(resp.getStatusLine()); for (Header h : resp.getAllHeaders()) { System.out.println(h); } System.out.println("**********End of Headers********** \n\n"); entity = resp.getEntity(); InputStream instream = entity.getContent(); ByteArrayOutputStream baos = new ByteArrayOutputStream(); byte[] bytes = new byte[instream.available()]; int reads = instream.read(); while (reads != -1) { baos.write(reads); reads = instream.read(); } bytes = baos.toByteArray(); captchaImage = "data:image/png;base64," + new BASE64Encoder().encode(bytes); // show captchaImage of base64 code // System.out.println(captchaImage); EntityUtils.consume(entity); // 4.Use Json format to wrap url, key and source string. 
// Then send response to ajax request from index.jsp JSONObject jo = new JSONObject(); jo.put("__EVENTTARGET", __EVENTTARGET); jo.put("__EVENTARGUMENT", __EVENTARGUMENT); jo.put("__VIEWSTATE", __VIEWSTATE); jo.put("__VIEWSTATEGENERATOR", __VIEWSTATEGENERATOR); jo.put("captchaURL", captchaURL); jo.put("captchaImage", captchaImage); jo.put(cookies.get(0).getName(), cookies.get(0).getValue()); String bothJson = "[" + jo + "," + joOption + "]"; response.setContentType("application/json"); response.setCharacterEncoding("utf-8"); PrintWriter out = response.getWriter(); out.print(bothJson); out.flush(); }
public ArrayList<JuickMessage> parseWebMessageListPure(String htmlStr) { ArrayList<JuickMessage> retval = new ArrayList<JuickMessage>(); Document parsed = Jsoup.parse(htmlStr); Elements posts = parsed.select("div"); ISimpleDateFormat sdf; ISimpleDateFormat sdf2; sdf = DevJuickComMessages.sdftz.createSDF("yyyy dd MMM HH:mm", "en", "US", "UTC"); sdf2 = DevJuickComMessages.sdftz.createSDF("yyyy dd MMM HH:mm", "ru", "RU", "UTC"); Calendar cal = Calendar.getInstance(); int currentYear = cal.get(Calendar.YEAR); for (Element post : posts) { String postClass = post.attr("class"); if (postClass.equals("post") || postClass.startsWith("post ")) { PointMessage message = new PointMessage(); message.User = new JuickUser(); message.User.UName = post.select("div[class=info] > a > img").attr("alt"); if (message.User.UName.length() == 0) { message.User.UName = post.select("div[class=author] > a").text(); } String dataId = post.attr("data-id"); String dataCommentId = post.attr("data-comment-id"); String dataToCommentId = post.attr("data-to-comment-id"); message.setMID(new PointMessageID(message.User.UName, dataId, 0)); if (dataCommentId.length() > 0) { message.setRID(Integer.parseInt(dataCommentId)); } if (dataToCommentId.length() > 0) { message.setReplyTo(Integer.parseInt(dataToCommentId)); } message.tags = new Vector<String>(); for (Element el : post.select("a[class=tag]")) { message.tags.add(el.text()); } message.microBlogCode = PointMessageID.CODE; StringBuilder dt = new StringBuilder(); for (Element el : post.select("div[class=created]")) { dt.append(" "); dt.append(el.text()); } try { message.Timestamp = new Date(sdf.parse(currentYear + " " + dt.toString().trim())); } catch (IllegalArgumentException e) { try { message.Timestamp = new Date(sdf2.parse(currentYear + " " + dt.toString().trim())); } catch (IllegalArgumentException e1) { continue; } } Date mt = message.Timestamp; if (mt.getTime() > System.currentTimeMillis() + 50 * 24 * 60 * 60 * 1000L) { Calendar cal2 = 
Calendar.getInstance(); cal2.setTime(mt); cal2.set(Calendar.YEAR, cal2.get(Calendar.YEAR) - 1); message.Timestamp = cal2.getTime(); } Elements postEls = post.select("div[class=text-content]"); if (postEls.size() < 1) { postEls = post.select("div[class=text]"); } String referencedImages = ""; Elements postimg = post.select("a[class=postimg]"); for (Element as : postimg) { Elements imgs = as.select("img"); for (Element img : imgs) { String src = img.attr("src"); referencedImages += " " + src; } } message.csrf_token = post.select("input[name=csrf_token]").attr("value"); // last part if (postEls.size() < 1) { message.Text = "Error parsing text ;-("; } else { Element elem = cleanupElement(postEls.get(0)); postEls.get(0).appendChild(elem); // add to document Document.OutputSettings os = elem.ownerDocument().outputSettings(); os.prettyPrint(false); String text = Utils.replace(elem.toString(), "\n", " "); text = Utils.replace(text, "&", "&"); // this was improperly done in cleanupElement while (true) { long olds = text.length(); text = Utils.replace(text, " ", " "); long news = text.length(); if (news == olds) break; } try { message.replies = Integer.parseInt(post.select("span[class=cn]").text()); } catch (Exception ex) { } text += referencedImages; message.Text = unwebMessageTextPoint(text); } retval.add(message); } } for (JuickMessage juickMessage : retval) { if (juickMessage.getRID() != 0 && juickMessage.getReplyTo() != 0 && !juickMessage.Text.startsWith("@")) { String uzur = null; for (JuickMessage scan : retval) { if (scan.getRID() == juickMessage.getReplyTo()) { uzur = scan.User.UName; break; } } if (uzur != null) { juickMessage.Text = "@" + uzur + " " + juickMessage.Text; } } } return retval; }
// start setting of selectorbar public void selectorBarTranslate( Node selectorBarPanelNode, Element ele, Map<String, String> urlMap, String locale) { try { String title = (ele != null ? ele.getElementsByTag("a").first().text() : ""); String titleUrl = ele.getElementsByTag("a").first().absUrl("href"); if (StringUtil.isBlank(titleUrl)) { titleUrl = ele.getElementsByTag("a").first().attr("href"); } // Start extracting valid href log.debug("Before selector bar title LinkUrl" + titleUrl + "\n"); titleUrl = FrameworkUtils.getLocaleReference(titleUrl, urlMap, locale, sb); log.debug("after selector bar title LinkUrl" + titleUrl + "\n"); // End extracting valid href log.debug("selector component titleUrl: " + titleUrl); selectorBarPanelNode.setProperty("title", title); selectorBarPanelNode.setProperty("titleurl", titleUrl); if (ele.childNodeSize() >= 2) { log.debug("Child node size is greater than 1."); if (ele.select("div.menu").isEmpty()) { log.debug("Menu is not available."); sb.append( "<li>Selector bar drop down menu elements does not exist on the locale page.</li>"); } else { log.debug("Menu is available."); Element menuEle = ele.child(1); if (menuEle != null) { log.debug("selector component menuEle: " + menuEle.toString()); Element anchor = menuEle.getElementsByTag("a").last(); String allLinkText = anchor != null ? anchor.text() : ""; String allLinkUrl = anchor != null ? 
anchor.absUrl("href") : ""; if (StringUtil.isBlank(allLinkUrl)) { allLinkUrl = anchor.attr("href"); } // Start extracting valid href log.debug("Before selector bar menu LinkUrl" + allLinkUrl + "\n"); allLinkUrl = FrameworkUtils.getLocaleReference(allLinkUrl, urlMap, locale, sb); log.debug("after selector bar menu LinkUrl" + allLinkUrl + "\n"); // End extracting valid href selectorBarPanelNode.setProperty("alllinktext", allLinkText); selectorBarPanelNode.setProperty("alllinkurl", allLinkUrl); Elements menuUlList = menuEle.getElementsByTag("ul"); for (Element element : menuUlList) { java.util.List<String> list = new ArrayList<String>(); Elements menuLiList = element.getElementsByTag("li"); System.out.println(menuLiList.size()); for (Element li : menuLiList) { JSONObject jsonObj = new JSONObject(); Element listItemAnchor = li.getElementsByTag("a").first(); String anchorText = listItemAnchor != null ? listItemAnchor.text() : ""; String anchorHref = listItemAnchor.absUrl("href"); if (StringUtil.isBlank(anchorHref)) { anchorHref = listItemAnchor.attr("href"); } // Start extracting valid href log.debug("Before selectorbarLinkUrl" + anchorHref + "\n"); anchorHref = FrameworkUtils.getLocaleReference(anchorHref, urlMap, locale, sb); log.debug("after selectorbarLinkUrl" + anchorHref + "\n"); // End extracting valid href jsonObj.put("linktext", anchorText); jsonObj.put("linkurl", anchorHref); jsonObj.put("size", ""); list.add(jsonObj.toString()); } selectorBarPanelNode.setProperty("panelitems", list.toArray(new String[list.size()])); } } else { sb.append( "<li>Selector bar drop down menu elements does not exist on the locale page.</li>"); } } } else { sb.append( "<li>Selector bar drop down menu elements does not exist on the locale page.</li>"); } } catch (Exception e) { e.printStackTrace(); } }
/** * 新闻的 url 格式为 http://see.xidian.edu.cn/html/news/7928.html * * @param id 某个新闻页面的序号 * @return 爬取该页面上的新闻信息,提取相应的信息,存到新闻bean里。如果没有爬取到新闻返回null * @throws Exception */ public static ArticleItem parseNewsItem(int id) throws Exception { // 根据后缀的数字,拼接新闻 url String urlStr = Constant.ARTICLE_BASE_URL + id + ".html"; // 利用get请求获取字符串再解析会有小部分乱码 // String htmlStr = HttpTool.doGet(urlStr); // Document doc = Jsoup.parse(htmlStr); // try { Document doc = Jsoup.connect(urlStr).timeout(10000).get(); // 去掉jsoup对html字符串加的"\n",方便json字符串返回 doc.outputSettings().prettyPrint(false); Element articleEle = doc.getElementById("article"); // 标题 Element titleEle = articleEle.getElementById("article_title"); String titleStr = titleEle.text(); // article_detail包括了 2016-01-15 来源: 浏览次数:177 Element detailEle = articleEle.getElementById("article_detail"); Elements details = detailEle.getElementsByTag("span"); // 发布时间 String dateStr = details.get(0).text(); // 新闻来源 String sourceStr = details.get(1).text(); // 去掉"来源:" if (SOURCE_PREFIX.equals(sourceStr.trim())) { sourceStr = "SeeNews"; } else { sourceStr = sourceStr.substring(3).trim(); } // 访问这个新闻页面,浏览次数会+1,次数是 JS 渲染的 String jsStr = HttpTool.doGet(COUNT_BASE_URL + id); int readTimes = Integer.parseInt(jsStr.replaceAll("\\D+", "")); // 或者使用下面这个正则方法 // String readTimesStr = jsStr.replaceAll("[^0-9]", ""); Element contentEle = articleEle.getElementById("article_content"); // 新闻主体内容 String contentStr = contentEle.toString(); // 如果用 text()方法,新闻主体内容的 html 标签会丢失 // 为了在 Android 上用 WebView 显示 html,用toString() // String contentStr = contentEle.text(); Elements images = contentEle.getElementsByTag("img"); String[] imageUrls = new String[images.size()]; // 图片上传到七牛 // 将body中的图片地址替换为七牛的地址 for (int i = 0; i < imageUrls.length; i++) { String origin = images.get(i).attr("src"); imageUrls[i] = ImageTool.convertUrl(id, origin); if (!origin.equals(imageUrls[i])) { // 只有上传图片到七牛,url 才会变化 // 不相等,才替换为七牛的url contentStr = contentStr.replace( Constant.SRC_PREFIX + origin, 
Constant.SRC_PREFIX + Constant.BUCKET_HOST_NAME + imageUrls[i]); } } // 处理相对路径 url,不和上面的 image url 冲突 Elements hrefs = contentEle.getElementsByTag("a"); for (int i = 0; i < hrefs.size(); i++) { String origin = hrefs.get(i).attr("href"); if (Constant.DEBUG) { System.out.println("原始 href=" + origin); } String newUrl = UrlTool.dealAttachmentUrl(id, origin); // 防止页面的附件 重复出现,替换多次 // 出现这种 // http://see.xidian.edu.cnhttp://see.xidian.edu.cn/uploads/file if (!origin.equals(newUrl)) { // 不相等,才替换为新的url 且url未被替换过 contentStr = contentStr.replace(Constant.HREF_PREFIX + origin, Constant.HREF_PREFIX + newUrl); } } return new ArticleItem(id, imageUrls, titleStr, dateStr, readTimes, sourceStr, contentStr); }
@Override public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception { LOGGER.debug("getMetadata() " + options.toString()); if (options.getType() != MediaType.MOVIE) { throw new UnsupportedMediaTypeException(options.getType()); } String id = ""; if (StringUtils.isNotBlank(options.getId(providerInfo.getId()))) { id = options.getId(providerInfo.getId()); } if (StringUtils.isBlank(id) && options.getResult() != null) { if (StringUtils.isEmpty(options.getResult().getId())) { id = StrgUtils.substr(options.getResult().getUrl(), "id=(.*?)"); } else { id = options.getResult().getId(); } } // we can not scrape without zelluloid id and url if (StringUtils.isBlank(id) && StringUtils.isBlank(options.getResult().getUrl())) { throw new Exception("cannot scrape without id and url"); } String detailurl = BASE_URL + "/filme/index.php3?id=" + id; if (StringUtils.isBlank(id)) { detailurl = options.getResult().getUrl(); } MediaMetadata md = new MediaMetadata(providerInfo.getId()); Url url; try { url = new CachedUrl(detailurl); InputStream in = url.getInputStream(); Document doc = Jsoup.parse(in, PAGE_ENCODING, ""); in.close(); // parse title String title = doc.getElementsByAttributeValue("property", "og:title").attr("content").trim(); md.setTitle(title); // parse plot String plot = doc.getElementsByAttributeValue("class", "bigtext").text(); md.setPlot(plot); md.setTagline(plot.length() > 150 ? 
plot.substring(0, 150) : plot); // parse poster Elements el = doc.getElementsByAttributeValueStarting("src", "/images/poster"); if (el.size() == 1) { // Poster MediaArtwork ma = new MediaArtwork(providerInfo.getId(), MediaArtwork.MediaArtworkType.POSTER); ma.setPreviewUrl(BASE_URL + el.get(0).attr("src")); ma.setDefaultUrl(BASE_URL + el.get(0).attr("src")); ma.setLanguage(options.getLanguage().getLanguage()); md.addMediaArt(ma); } // parse year el = doc.getElementsByAttributeValueContaining("href", "az.php3?j="); if (el.size() == 1) { try { md.setYear(Integer.parseInt(el.get(0).text())); } catch (Exception ignored) { } } // parse cinema release el = doc.getElementsByAttributeValueContaining("href", "?v=w"); if (el.size() > 0) { try { SimpleDateFormat sdf = new SimpleDateFormat("dd.MM.yyyy"); Date d = sdf.parse(el.get(0).text()); md.setReleaseDate(d); } catch (Exception e) { LOGGER.warn("cannot parse cinema release date: " + el.get(0).text()); } } // parse original title md.setOriginalTitle(StrgUtils.substr(doc.toString(), "Originaltitel: (.*?)\\<")); if (StringUtils.isEmpty(md.getOriginalTitle())) { md.setOriginalTitle(md.getTitle()); } // parse runtime String rt = (StrgUtils.substr(doc.toString(), "ca. (.*?) min")); if (!rt.isEmpty()) { try { md.setRuntime(Integer.valueOf(rt)); } catch (Exception e2) { LOGGER.warn("cannot convert runtime: " + rt); } } // parse genres el = doc.getElementsByAttributeValueContaining("href", "az.php3?g="); for (Element g : el) { String gid = g.attr("href").substring(g.attr("href").lastIndexOf('=') + 1); md.addGenre(getTmmGenre(gid)); } // parse cert // FSK: ab 12, $230 Mio. 
Budget String fsk = StrgUtils.substr(doc.toString(), "FSK: (.*?)[,<]"); if (!fsk.isEmpty()) { md.addCertification(Certification.findCertification(fsk)); } // parse rating Elements ratings = doc.getElementsByAttributeValue("class", "ratingBarTable"); if (ratings.size() == 2) { // get user rating Element e = ratings.get(1); // <div>87%</div> String r = e.getElementsByTag("div").text().replace("%", ""); try { md.setRating(Float.valueOf(r) / 10); // only 0-10 } catch (Exception e2) { LOGGER.warn("cannot convert rating: " + r); } } // details page doc = null; String detailsUrl = BASE_URL + "/filme/details.php3?id=" + id; try { url = new CachedUrl(detailsUrl); in = url.getInputStream(); doc = Jsoup.parse(in, PAGE_ENCODING, ""); in.close(); } catch (Exception e) { LOGGER.error("failed to get details: " + e.getMessage()); } if (doc != null) { Element tab = doc.getElementById("ccdetails"); int header = 0; String lastRole = ""; for (Element tr : tab.getElementsByTag("tr")) { if (tr.toString().contains("dyngfx")) { // header gfx if (tr.toString().contains("Besetzung")) { header = 1; } else if (tr.toString().contains("Crew")) { header = 2; } else if (tr.toString().contains("Produktion")) { // company, not producers header = 3; } else if (tr.toString().contains("Verleih")) { header = 4; } else if (tr.toString().contains("Alternativtitel")) { header = 5; } continue; } else { // no header gfx, so data MediaCastMember mcm = new MediaCastMember(); el = tr.getElementsByTag("td"); if (header == 1) { // actors if (el.size() == 2) { String role = "" + el.get(0).text().trim(); // text() decodes to \u00a0 if (role.equals("\u00a0") || StringUtils.isBlank(role)) { continue; } mcm.setCharacter(role); mcm.setName(el.get(1).getElementsByTag("a").text()); mcm.setId( StrgUtils.substr(el.get(1).getElementsByTag("a").attr("href"), "id=(\\d+)")); mcm.setType(MediaCastMember.CastType.ACTOR); md.addCastMember(mcm); // parsing actor pages would we too heavy here just for actor images.. 
} } else if (header == 2) { // crew if (el.size() == 2) { String crewrole = el.get(0).html().trim(); mcm.setName(el.get(1).getElementsByTag("a").text()); if (crewrole.equals(" ")) { crewrole = lastRole; // pop previous } else { lastRole = crewrole; // push new } mcm.setPart(crewrole); switch (crewrole) { case "Regie": mcm.setType(MediaCastMember.CastType.DIRECTOR); break; case "Drehbuch": mcm.setType(MediaCastMember.CastType.WRITER); break; case "Produktion": mcm.setType(MediaCastMember.CastType.PRODUCER); break; default: mcm.setType(MediaCastMember.CastType.OTHER); break; } mcm.setId( StrgUtils.substr(el.get(1).getElementsByTag("a").attr("href"), "id=(\\d+)")); md.addCastMember(mcm); } } else if (header == 3) { // production md.addProductionCompany(el.get(0).text()); } } } } // get links page doc = null; String linksUrl = BASE_URL + "/filme/links.php3?id=" + id; try { url = new CachedUrl(linksUrl); in = url.getInputStream(); doc = Jsoup.parse(in, PAGE_ENCODING, ""); in.close(); } catch (Exception e) { LOGGER.error("failed to get links page: " + e.getMessage()); } if (doc != null) { el = doc.getElementsByAttributeValueContaining("href", "german.imdb.com"); if (el != null && el.size() > 0) { String imdb = StrgUtils.substr(el.get(0).attr("href"), "(tt\\d{7})"); if (imdb.isEmpty()) { imdb = "tt" + StrgUtils.substr(el.get(0).attr("href"), "\\?(\\d+)"); } md.setId(MediaMetadata.IMDB, imdb); } } } catch (Exception e) { LOGGER.error("Error parsing " + detailurl); throw e; } return md; }