/**
   * Downloads the labirint.ru product page for a book and scrapes its details into a {@link Book}.
   *
   * <p>Scraping is best-effort: any piece of the page that is missing leaves the corresponding
   * text field as an empty string and the price as {@code null}. If the product page itself
   * cannot be downloaded, the returned book carries only the product id (plus whatever the
   * separate "design" request yields) with every other field at its default.
   *
   * <p>Text fields are capped at fixed lengths (150 for the name, 100 for author/editor, 200 for
   * publisher, 20 for ISBN and weight, 30 for page count and dimensions, 400 for decor, 2000 for
   * the annotation).
   *
   * @param bookIcon element whose {@code data-product-id} attribute holds the product id
   * @return the scraped book; never {@code null}
   * @throws NumberFormatException if the {@code data-product-id} attribute is not an integer
   */
  protected Book bookInfo(Element bookIcon) {
    String productId = bookIcon.attr("data-product-id");

    Document document = null;
    try {
      document = Jsoup.parse(new URL("http://www.labirint.ru/books/" + productId), 5000);
    } catch (IOException ignored) {
      // Also covers MalformedURLException. The page is unavailable, so `document` stays null
      // and every field below falls back to its default instead of failing with an NPE.
    }

    Book book = new Book();
    book.setProductId(Integer.parseInt(productId));

    Element bookElement = (document == null) ? null : document.getElementById("product");
    Element specs = (bookElement == null) ? null : bookElement.getElementById("product-specs");

    // Title: combined text of the <h1> tags inside #product-title.
    String name = "";
    Element title = (bookElement == null) ? null : bookElement.getElementById("product-title");
    if (title != null) {
      name = title.getElementsByTag("h1").text().trim();
    }
    book.setName(truncateTo(name, 150));

    // Author / editor: both live in ".authors" elements and are told apart by their label.
    // When several elements match, the last one wins.
    String author = "";
    String editor = "";
    if (specs != null) {
      for (Element authorsElement : specs.getElementsByClass("authors")) {
        String text = authorsElement.text();
        if (text.contains("Автор")) {
          author = labelledValue(text);
        }
        if (text.contains("Редактор")) {
          editor = labelledValue(text);
        }
      }
    }
    book.setAuthor(truncateTo(author, 100));
    book.setEditor(truncateTo(editor, 100));

    book.setPublisherAndYear(specField(specs, "publisher", 200));

    // Price: prefer the "pricenew" element and fall back to the regular price element.
    String priceText = firstTextByClass(specs, "buying-pricenew-val-number");
    if (priceText == null) {
      priceText = firstTextByClass(specs, "buying-price-val-number");
    }
    Double price = null;
    if (priceText != null) {
      try {
        price = Double.parseDouble(truncateTo(priceText, 10));
      } catch (NumberFormatException ignored) {
        // Unparseable price text: leave the price unset (null).
      }
    }
    book.setPrice(price);

    book.setIsbn(specField(specs, "isbn", 20));
    book.setPagesCount(specField(specs, "pages2", 30));

    // Decor comes from a separate request; its whole text content is used.
    String decor = "";
    try {
      Document decorHtml =
          Jsoup.parse(new URL("http://www.labirint.ru/ajax/design/" + productId), 5000);
      decor = truncateTo(decorHtml.text(), 400);
    } catch (Exception ignored) {
      // Deliberately broad, as in the original: decor is optional and must never abort scraping.
    }
    book.setDecor(decor);

    book.setWeight(specField(specs, "weight", 20));
    book.setDimensions(specField(specs, "dimensions", 30));

    // Annotation: take the first of these elements that exists.
    String annotation = "";
    if (bookElement != null) {
      String[] annotationIds = {"fullannotation", "product-about", "smallannotation"};
      for (String annotationId : annotationIds) {
        Element annotationElement = bookElement.getElementById(annotationId);
        if (annotationElement != null) {
          annotation = annotationElement.text();
          break;
        }
      }
    }
    book.setAnnotation(truncateTo(annotation, 2000));

    // Cover: getElementsByAttributeValue returns an empty collection (not null) when nothing
    // matches, so the placeholder has to be chosen on emptiness.
    String imageUrl = "";
    if (document != null) {
      Elements ogImages = document.getElementsByAttributeValue("property", "og:image");
      if (ogImages.size() > 0) {
        imageUrl = ogImages.get(0).attr("content");
      } else {
        imageUrl = "http://img.labirint.ru/design/emptycover.png";
      }
    }
    book.setCoverImgUrl(imageUrl);

    Element comments = (document == null) ? null : document.getElementById("product-comments");
    if (comments != null) {
      book.setComments(parseComments(comments, book));
    }

    return book;
  }

  /** Returns {@code value} unchanged if it fits, otherwise its first {@code maxLength} chars. */
  private String truncateTo(String value, int maxLength) {
    return (value.length() <= maxLength) ? value : value.substring(0, maxLength);
  }

  /**
   * Returns the text of the first element with the given class under {@code parent}, or
   * {@code null} when {@code parent} is null or nothing matches.
   */
  private String firstTextByClass(Element parent, String className) {
    if (parent == null) {
      return null;
    }
    Elements matches = parent.getElementsByClass(className);
    return (matches.size() > 0) ? matches.get(0).text() : null;
  }

  /**
   * Extracts the value from a "Label: value" string, dropping double quotes and surrounding
   * whitespace. Text that does not split into exactly two parts on ':' yields its first part;
   * text consisting only of colons yields an empty string.
   */
  private String labelledValue(String text) {
    String[] parts = text.split(":");
    if (parts.length == 0) {
      return "";
    }
    String raw = (parts.length == 2) ? parts[1] : parts[0];
    return raw.replaceAll("\"", "").trim();
  }

  /** Reads one labelled field from the specs element, capped at {@code maxLength}; "" if absent. */
  private String specField(Element specs, String className, int maxLength) {
    String text = firstTextByClass(specs, className);
    return (text == null) ? "" : truncateTo(labelledValue(text), maxLength);
  }
 /**
  * Looks up an element by id through {@code doc.getElementById}.
  *
  * @param id the id to look for
  * @param doc the element on which the lookup is performed
  * @return whatever {@code doc.getElementById(id)} returns
  */
 private Element fromId(String id, Element doc) {
   final Element match = doc.getElementById(id);
   return match;
 }
예제 #3
0
  /**
   * Fetches one news page and extracts its data into an {@link ArticleItem}.
   *
   * <p>The page URL is {@code Constant.ARTICLE_BASE_URL + id + ".html"}; per the original
   * author's note, news URLs look like http://see.xidian.edu.cn/html/news/7928.html.
   *
   * @param id sequence number of the news page
   * @return the parsed news item, or {@code null} if the page does not contain a usable article
   *     (no {@code #article} element, or its title, detail or content part is missing)
   * @throws Exception if the page or the view-counter request fails, or the counter response
   *     contains no digits
   */
  public static ArticleItem parseNewsItem(int id) throws Exception {
    // Build the news URL from the numeric suffix.
    String urlStr = Constant.ARTICLE_BASE_URL + id + ".html";

    // Original author's note: fetching the HTML as a string with HttpTool.doGet and parsing it
    // afterwards left a small part of the text garbled, so Jsoup performs the request itself.
    Document doc = Jsoup.connect(urlStr).timeout(10000).get();
    // Turn off pretty printing so Jsoup does not add "\n" to the HTML; this keeps the content
    // convenient to return inside a JSON string.
    doc.outputSettings().prettyPrint(false);

    Element articleEle = doc.getElementById("article");
    if (articleEle == null) {
      // No article on this page: honour the documented "return null" contract instead of
      // failing with a NullPointerException.
      return null;
    }

    Element titleEle = articleEle.getElementById("article_title");
    // article_detail holds e.g. "2016-01-15 来源: 浏览次数:177" (date, source, view count).
    Element detailEle = articleEle.getElementById("article_detail");
    Element contentEle = articleEle.getElementById("article_content");
    if (titleEle == null || detailEle == null || contentEle == null) {
      return null;
    }

    Elements details = detailEle.getElementsByTag("span");
    if (details.size() < 2) {
      // Both the date span and the source span are required.
      return null;
    }

    // Title.
    String titleStr = titleEle.text();

    // Publication date.
    String dateStr = details.get(0).text();

    // News source: a bare prefix means no source was given, so use the default name;
    // otherwise drop the leading 3-character "来源:" label.
    String sourceStr = details.get(1).text();
    if (SOURCE_PREFIX.equals(sourceStr.trim())) {
      sourceStr = "SeeNews";
    } else {
      sourceStr = sourceStr.substring(3).trim();
    }

    // Original author's note: visiting the news page increments its view count, and the count
    // is rendered by JS, so it is requested separately. Everything that is not a digit is
    // stripped from the response before parsing.
    String jsStr = HttpTool.doGet(COUNT_BASE_URL + id);
    int readTimes = Integer.parseInt(jsStr.replaceAll("\\D+", ""));

    // Main content. toString() is used instead of text() so the HTML tags survive; the
    // original note says this is for display in an Android WebView.
    String contentStr = contentEle.toString();

    Elements images = contentEle.getElementsByTag("img");
    String[] imageUrls = new String[images.size()];

    // Original author's note: images are uploaded to Qiniu and the image addresses in the body
    // are replaced with the Qiniu ones.
    for (int i = 0; i < imageUrls.length; i++) {
      String origin = images.get(i).attr("src");
      imageUrls[i] = ImageTool.convertUrl(id, origin);
      if (!origin.equals(imageUrls[i])) {
        // Per the original note the URL only changes when the image was uploaded; rewrite the
        // body only in that case.
        contentStr =
            contentStr.replace(
                Constant.SRC_PREFIX + origin,
                Constant.SRC_PREFIX + Constant.BUCKET_HOST_NAME + imageUrls[i]);
      }
    }

    // Handle relative link URLs; the href prefix keeps this from clashing with the image
    // replacement above.
    Elements hrefs = contentEle.getElementsByTag("a");
    for (int i = 0; i < hrefs.size(); i++) {
      String origin = hrefs.get(i).attr("href");
      if (Constant.DEBUG) {
        System.out.println("原始 href=" + origin);
      }
      String newUrl = UrlTool.dealAttachmentUrl(id, origin);

      // Replace only when the URL actually changed. Original note: this guards against an
      // attachment that appears several times being rewritten repeatedly, which produced
      // http://see.xidian.edu.cnhttp://see.xidian.edu.cn/uploads/file
      if (!origin.equals(newUrl)) {
        contentStr =
            contentStr.replace(Constant.HREF_PREFIX + origin, Constant.HREF_PREFIX + newUrl);
      }
    }

    return new ArticleItem(id, imageUrls, titleStr, dateStr, readTimes, sourceStr, contentStr);
  }