/**
 * Scrapes the labirint.ru product page of the book referenced by {@code bookIcon} and copies
 * the details found there into a new {@link Book}.
 *
 * <p>The extraction is best-effort. Every text field whose markup is missing is set to an empty
 * string and a price that is missing or not parseable is set to {@code null}. If the product
 * page itself cannot be downloaded, the returned book carries only the product id plus those
 * defaults (the design/"decor" request is still attempted, since it is independent of the page).
 *
 * <p>Text values are length-limited per field. As in the previous inline code, a value longer
 * than the limit is cut to {@code limit - 1} characters.
 *
 * @param bookIcon element whose {@code data-product-id} attribute holds the product id
 * @return the populated book, never {@code null}
 * @throws NumberFormatException if the {@code data-product-id} attribute is not an integer
 */
protected Book bookInfo(Element bookIcon) {
  String productId = bookIcon.attr("data-product-id");

  Document document = null;
  try {
    document = Jsoup.parse(new URL("http://www.labirint.ru/books/" + productId), 5000);
  } catch (IOException ignored) {
    // Deliberately not fatal (MalformedURLException is an IOException too): a page that cannot
    // be loaded is treated like a page without product markup, so the fields below fall back
    // to their defaults instead of failing with a NullPointerException.
  }

  Book book = new Book();
  book.setProductId(Integer.parseInt(productId));

  Element bookElement = (document == null) ? null : document.getElementById("product");
  Element specs = (bookElement == null) ? null : bookElement.getElementById("product-specs");

  // Title: text of the <h1> element(s) inside #product-title.
  Element titleBox = (bookElement == null) ? null : bookElement.getElementById("product-title");
  String name = (titleBox == null) ? "" : titleBox.getElementsByTag("h1").text().trim();
  book.setName(truncateField(name, 150));

  // Author and editor share the "authors" class and are told apart by their label text.
  String author = "";
  String editor = "";
  if (specs != null) {
    for (Element credit : specs.getElementsByClass("authors")) {
      String creditText = credit.text();
      if (creditText.contains("Автор")) {
        author = stripLabel(creditText);
      }
      if (creditText.contains("Редактор")) {
        editor = stripLabel(creditText);
      }
    }
  }
  book.setAuthor(truncateField(author, 100));
  book.setEditor(truncateField(editor, 100));

  book.setPublisherAndYear(truncateField(specText(specs, "publisher"), 200));

  // Price: the "pricenew" element wins when present, otherwise the regular price element.
  Double price = null;
  if (specs != null) {
    Elements priceNodes = specs.getElementsByClass("buying-pricenew-val-number");
    if (priceNodes.isEmpty()) {
      priceNodes = specs.getElementsByClass("buying-price-val-number");
    }
    if (!priceNodes.isEmpty()) {
      try {
        price = Double.parseDouble(truncateField(priceNodes.get(0).text(), 10));
      } catch (NumberFormatException ignored) {
        // Unparseable price text: keep the price unset (null).
      }
    }
  }
  book.setPrice(price);

  book.setIsbn(truncateField(specText(specs, "isbn"), 20));
  book.setPagesCount(truncateField(specText(specs, "pages2"), 30));

  // The design description is served by a separate endpoint.
  String decor = "";
  try {
    Document decorHtml =
        Jsoup.parse(new URL("http://www.labirint.ru/ajax/design/" + productId), 5000);
    decor = truncateField(decorHtml.text(), 400);
  } catch (Exception ignored) {
    // Best-effort, as before: any failure of this optional request leaves the field empty.
  }
  book.setDecor(decor);

  book.setWeight(truncateField(specText(specs, "weight"), 20));
  book.setDimensions(truncateField(specText(specs, "dimensions"), 30));

  // Annotation: first available of #fullannotation, #product-about, #smallannotation.
  String annotation = "";
  if (bookElement != null) {
    Element annotationNode = bookElement.getElementById("fullannotation");
    if (annotationNode == null) {
      annotationNode = bookElement.getElementById("product-about");
    }
    if (annotationNode == null) {
      annotationNode = bookElement.getElementById("smallannotation");
    }
    if (annotationNode != null) {
      annotation = truncateField(annotationNode.text(), 2000);
    }
  }
  book.setAnnotation(annotation);

  // Cover: getElementsByAttributeValue never returns null, so emptiness (not null) decides
  // whether the placeholder cover is used.
  String coverUrl = "";
  if (document != null) {
    Elements ogImage = document.getElementsByAttributeValue("property", "og:image");
    if (ogImage.isEmpty()) {
      coverUrl = "http://img.labirint.ru/design/emptycover.png";
    } else {
      coverUrl = ogImage.get(0).attr("content");
    }
  }
  book.setCoverImgUrl(coverUrl);

  Element comments = (document == null) ? null : document.getElementById("product-comments");
  if (comments != null) {
    book.setComments(parseComments(comments, book));
  }
  return book;
}

/** Returns {@code value} unchanged if it fits {@code limit}, otherwise its first {@code limit - 1} characters. */
private String truncateField(String value, int limit) {
  return (value.length() <= limit) ? value : value.substring(0, limit - 1);
}

/**
 * Extracts the value from a {@code "label: value"} text: the part after the colon when the text
 * splits into exactly two parts, otherwise the first part; double quotes are removed and the
 * result is trimmed. Returns an empty string when the split yields nothing (for example ":").
 */
private String stripLabel(String text) {
  String[] parts = text.split(":");
  if (parts.length == 0) {
    return "";
  }
  String chosen = (parts.length == 2) ? parts[1] : parts[0];
  return chosen.replace("\"", "").trim();
}

/** Returns the label-stripped text of the first element with {@code cssClass} under {@code specs}, or "" if there is none. */
private String specText(Element specs, String cssClass) {
  if (specs == null) {
    return "";
  }
  Elements matches = specs.getElementsByClass(cssClass);
  if (matches.isEmpty()) {
    return "";
  }
  return stripLabel(matches.get(0).text());
}
/**
 * Looks up an element by id by delegating to {@code doc.getElementById(id)}.
 *
 * @param id id value to search for
 * @param doc element on which the lookup is performed
 * @return the result of {@code doc.getElementById(id)}
 */
private Element fromId(String id, Element doc) {
  final Element match = doc.getElementById(id);
  return match;
}
/** * 新闻的 url 格式为 http://see.xidian.edu.cn/html/news/7928.html * * @param id 某个新闻页面的序号 * @return 爬取该页面上的新闻信息,提取相应的信息,存到新闻bean里。如果没有爬取到新闻返回null * @throws Exception */ public static ArticleItem parseNewsItem(int id) throws Exception { // 根据后缀的数字,拼接新闻 url String urlStr = Constant.ARTICLE_BASE_URL + id + ".html"; // 利用get请求获取字符串再解析会有小部分乱码 // String htmlStr = HttpTool.doGet(urlStr); // Document doc = Jsoup.parse(htmlStr); // try { Document doc = Jsoup.connect(urlStr).timeout(10000).get(); // 去掉jsoup对html字符串加的"\n",方便json字符串返回 doc.outputSettings().prettyPrint(false); Element articleEle = doc.getElementById("article"); // 标题 Element titleEle = articleEle.getElementById("article_title"); String titleStr = titleEle.text(); // article_detail包括了 2016-01-15 来源: 浏览次数:177 Element detailEle = articleEle.getElementById("article_detail"); Elements details = detailEle.getElementsByTag("span"); // 发布时间 String dateStr = details.get(0).text(); // 新闻来源 String sourceStr = details.get(1).text(); // 去掉"来源:" if (SOURCE_PREFIX.equals(sourceStr.trim())) { sourceStr = "SeeNews"; } else { sourceStr = sourceStr.substring(3).trim(); } // 访问这个新闻页面,浏览次数会+1,次数是 JS 渲染的 String jsStr = HttpTool.doGet(COUNT_BASE_URL + id); int readTimes = Integer.parseInt(jsStr.replaceAll("\\D+", "")); // 或者使用下面这个正则方法 // String readTimesStr = jsStr.replaceAll("[^0-9]", ""); Element contentEle = articleEle.getElementById("article_content"); // 新闻主体内容 String contentStr = contentEle.toString(); // 如果用 text()方法,新闻主体内容的 html 标签会丢失 // 为了在 Android 上用 WebView 显示 html,用toString() // String contentStr = contentEle.text(); Elements images = contentEle.getElementsByTag("img"); String[] imageUrls = new String[images.size()]; // 图片上传到七牛 // 将body中的图片地址替换为七牛的地址 for (int i = 0; i < imageUrls.length; i++) { String origin = images.get(i).attr("src"); imageUrls[i] = ImageTool.convertUrl(id, origin); if (!origin.equals(imageUrls[i])) { // 只有上传图片到七牛,url 才会变化 // 不相等,才替换为七牛的url contentStr = contentStr.replace( Constant.SRC_PREFIX + origin, 
Constant.SRC_PREFIX + Constant.BUCKET_HOST_NAME + imageUrls[i]); } } // 处理相对路径 url,不和上面的 image url 冲突 Elements hrefs = contentEle.getElementsByTag("a"); for (int i = 0; i < hrefs.size(); i++) { String origin = hrefs.get(i).attr("href"); if (Constant.DEBUG) { System.out.println("原始 href=" + origin); } String newUrl = UrlTool.dealAttachmentUrl(id, origin); // 防止页面的附件 重复出现,替换多次 // 出现这种 // http://see.xidian.edu.cnhttp://see.xidian.edu.cn/uploads/file if (!origin.equals(newUrl)) { // 不相等,才替换为新的url 且url未被替换过 contentStr = contentStr.replace(Constant.HREF_PREFIX + origin, Constant.HREF_PREFIX + newUrl); } } return new ArticleItem(id, imageUrls, titleStr, dateStr, readTimes, sourceStr, contentStr); }