Beispiel #1
0
  /** Checks that an element renders the same outer HTML before and after it is detached. */
  @Test
  public void parentlessToString() {
    final String expectedHtml = "<img src=\"foo\">";
    Element image = Jsoup.parse("<img src='foo'>").select("img").first();
    assertEquals(expectedHtml, image.toString());

    // Detach the element from its parent; rendering must still work without one.
    image.remove();
    assertEquals(expectedHtml, image.toString());
  }
Beispiel #2
0
 /**
  * Looks up a trailer link for the given movie by scraping a YouTube page.
  *
  * <p>Movies whose year is below 1990 are not looked up and yield the literal string
  * {@code "null"}. For every other movie the result starts with {@code http://www.youtube.com},
  * followed by the href found inside the first {@code yt-lockup-title} entry of the page. When
  * the page cannot be fetched or the expected markup is missing, only the bare base URL is
  * returned.
  *
  * @param movie the movie whose year and name drive the lookup
  * @return {@code "null"}, the bare YouTube base URL, or the base URL plus the scraped href
  */
 private static String getTrailer(Movie movie) {
   if (Integer.valueOf(movie.getMovieYear()) < 1990) {
     return "null";
   }
   String trailerLink = "http://www.youtube.com";
   String link = formatYoutubeString(movie.getMovieName());
   try {
     Document d = Jsoup.connect("http://www.youtube.com/" + link).get();
     String html = d.body().toString();

     String titleMarker = "class=\"yt-lockup-title \"><a href=\"";
     int titleStart = html.indexOf(titleMarker);
     if (titleStart < 0) {
       // No result entry on the page: keep the bare base URL.
       return trailerLink;
     }
     // Only the 100 characters starting at the marker are inspected. The window is clamped so
     // a marker close to the end of the page no longer runs past the end of the string.
     String linkDiv = html.substring(titleStart, Math.min(titleStart + 100, html.length()));

     // Skip the 9 characters of <a href=" itself.
     int hrefStart = linkDiv.indexOf("<a href=\"") + 9;
     // Stop 2 characters before the class attribute (presumably a closing quote and a space).
     int hrefEnd = linkDiv.indexOf("class=\"yt-uix-sessionlink") - 2;
     if (hrefStart < hrefEnd) {
       trailerLink += linkDiv.substring(hrefStart, hrefEnd);
     }
   } catch (Exception e) {
     System.out.println(e.toString());
   }
   return trailerLink;
 }
Beispiel #3
0
 /**
  * getMovieActors parses through the movie's page html and returns up to three actor names
  * taken from the "Stars:" section.
  *
  * <p>Slots for which no name can be extracted stay empty strings; when the page cannot be
  * fetched all three slots stay empty.
  *
  * @author defq0n
  * @param pageLink is the extended imdb url for the movie page, appended to
  *     {@code http://imdb.com}.
  * @return movieActors three-element String array containing the actor names.
  */
 private static String[] getMovieActors(String pageLink) {
   String[] movieActors = {"", "", ""};
   try {
     Document d = Jsoup.connect("http://imdb.com" + pageLink).get();
     String html = d.body().toString();

     // Cut out the markup between the "Stars:" heading and the full-cast link.
     String starsHeading = "<h4 class=\"inline\">Stars:</h4>";
     int divStart = html.indexOf(starsHeading) + starsHeading.length();
     int divEnd = html.indexOf("See full cast and crew");
     String actorsDiv = divStart < divEnd ? html.substring(divStart, divEnd) : "";

     String nameOpen = "itemprop=\"url\"><span class=\"itemprop\" itemprop=\"name\">";
     String nameClose = "</span></a>";
     String tempDiv = actorsDiv;
     for (int i = 0; i < movieActors.length; i++) { // we will get the first three top actors
       int nameStart = tempDiv.indexOf(nameOpen) + nameOpen.length();
       int nameEnd = tempDiv.indexOf(nameClose);
       String actor = nameStart < nameEnd ? tempDiv.substring(nameStart, nameEnd) : "";
       movieActors[i] = actor;

       // Continue right after this actor's closing </span> (7 characters). The position is
       // looked up in the full section, not in the already shortened remainder.
       int rest = actorsDiv.indexOf(actor + "</span>") + actor.length() + 7;
       tempDiv = rest < actorsDiv.length() ? actorsDiv.substring(rest) : "";
     }
   } catch (Exception e) {
     System.out.println(e.toString());
   }
   return movieActors;
 }
Beispiel #4
0
  /**
   * Fetches the page at a fixed URL (10 second timeout) and prints its title, the number of
   * {@code <meta>} elements, and the {@code content} attribute of every meta element whose
   * markup mentions "keywords" in any letter case.
   *
   * <p>Failures are printed as a stack trace and otherwise ignored.
   */
  public static void readHead() {
    String url = "http://www.2177s.com";
    try {
      Document doc = Jsoup.connect(url).timeout(10000).get();
      System.out.printf("title:%s\n", doc.title());

      Elements metas = doc.select("meta");
      System.out.println(metas.size());
      for (Element meta : metas) {
        // The match runs against the element's outer HTML, not against a single attribute.
        if (meta.toString().matches(".*(?i)keywords.*")) {
          System.out.println(meta.attr("content"));
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
Beispiel #5
0
 /**
  * Resolves the gallery URL that an image page links to.
  *
  * <p>The page is fetched and scanned for anchors whose {@code href} starts with
  * {@code gallery.php}; the first such link is appended to {@code http://imagearn.com/}.
  *
  * @param url the image page to inspect
  * @return the URL built from the first gallery link found
  * @throws IOException if the page contains no gallery link, or if the resulting URL is
  *     malformed
  */
 private URL getGalleryFromImage(URL url) throws IOException {
   Document page = Http.url(url).get();
   for (Element anchor : page.select("a[href~=^gallery\\.php.*$]")) {
     logger.info("LINK: " + anchor.toString());
     if (!anchor.hasAttr("href") || !anchor.attr("href").contains("gallery.php")) {
       continue;
     }
     URL galleryUrl = new URL("http://imagearn.com/" + anchor.attr("href"));
     logger.info("[!] Found gallery from given link: " + galleryUrl);
     return galleryUrl;
   }
   throw new IOException("Failed to find gallery at URL " + url);
 }
  /**
   * Downloads this post's page and fills {@code title}, {@code excerpt}, {@code content} and
   * {@code date} from the single "content" element inside the element with id "main".
   *
   * <p>Images found in the content are handed to {@code collectImages}. Every "lightbox"
   * element is replaced in place by the nodes {@code extractImageNodes} returns for it. The
   * content children are then concatenated up to the first child with class "clear". The first
   * {@code h1} (while no title is set) becomes the title and is left out of the content; the
   * first {@code p} (while no excerpt is set) also becomes the excerpt.
   *
   * <p>If the number of "content" elements is not exactly one, a message is printed and nothing
   * is changed.
   *
   * @param aInConnection connection used to fetch the page; its URL is set to this post's URL
   * @param images collection passed on to {@code collectImages}
   * @throws IOException if fetching the page fails
   */
  public void download(Connection aInConnection, Collection<Image> images) throws IOException {
    aInConnection.url(url);
    Document lDocument = aInConnection.get();
    Elements lContents = lDocument.getElementById("main").getElementsByClass("content");

    if (lContents.size() != 1) {
      System.out.println("More than one content in main section of post page " + toString());
      return;
    }

    Element lContent = lContents.first();
    collectImages(lContent, images);

    // Swap each lightbox wrapper for the image nodes extracted from it.
    for (Element lLightbox : lContent.getElementsByClass("lightbox")) {
      Collection<Node> lImageNodes = extractImageNodes(lLightbox);
      lLightbox.parent().insertChildren(lLightbox.siblingIndex(), lImageNodes);
      lLightbox.remove();
    }

    StringBuilder lHtml = new StringBuilder();
    for (Element lChild : lContent.children()) {
      if (lChild.hasClass("clear")) {
        // no more post content
        break;
      }

      boolean lIsTitle = title == null && lChild.tagName().equals("h1");
      if (lIsTitle) {
        // the first h1 header is the title; it is not part of the content
        title = lChild.html();
        continue;
      }

      if (excerpt == null && lChild.tagName().equals("p")) {
        excerpt = lChild.text();
      }
      lHtml.append(lChild.toString());
    }
    content = lHtml.toString();

    String lHunDate = lContent.getElementsByClass("date").first().html();
    date = new PostDate(lHunDate);
  }
Beispiel #7
0
 /**
  * getPosterLink parses through the movie's page html and returns the poster url link.
  *
  * <p>The link is the {@code src} value, up to and including {@code .jpg}, found between the
  * {@code class="image">} marker and the {@code pro-title-link} div. An empty string is
  * returned when the page cannot be fetched or the markup is not found.
  *
  * @author defq0n
  * @param pageLink is the extended imdb url for the movie page.
  * @return posterLink String containing the poster url link, or an empty string.
  */
 private static String getPosterLink(String pageLink) {
   String posterLink = "";
   try {
     Document d = Jsoup.connect("http://imdb.com" + pageLink).get();
     String html = d.body().toString();

     String imageMarker = "class=\"image\">";
     int divStart = html.indexOf(imageMarker) + imageMarker.length();
     int divEnd = html.indexOf("<div class=\"pro-title-link text-center\">");
     String posterDiv = divStart < divEnd ? html.substring(divStart, divEnd) : "";

     int srcStart = posterDiv.indexOf("src=\"") + 5; // first character after src="
     int srcEnd = posterDiv.indexOf(".jpg\"") + 4; // keep ".jpg", drop the closing quote
     if (srcStart < srcEnd) {
       posterLink = posterDiv.substring(srcStart, srcEnd);
     }
   } catch (Exception e) {
     System.out.println(e.toString());
   }
   return posterLink;
 }
Beispiel #8
0
 /**
  * getMovieDescription parses through the movie's page html and returns the description.
  *
  * <p>The description is the text between the first {@code description">} marker and the next
  * {@code </p>}, searched only up to the director block. An empty string is returned when the
  * page cannot be fetched or the markup is not found.
  *
  * @author defq0n
  * @param pageLink is the extended imdb url for the movie page.
  * @return movieDescription String containing the description, or an empty string.
  */
 private static String getMovieDescription(String pageLink) {
   String movieDescription = "";
   try {
     Document d = Jsoup.connect("http://imdb.com" + pageLink).get();
     String html = d.body().toString();

     String descriptionMarker = "description\">";
     int divStart = html.indexOf(descriptionMarker) + descriptionMarker.length();
     int divEnd = html.indexOf("<div class=\"txt-block\" itemprop=\"director\"");
     String descriptionDiv = divStart < divEnd ? html.substring(divStart, divEnd) : "";

     int paragraphEnd = descriptionDiv.indexOf("</p>");
     if (paragraphEnd > 0) {
       movieDescription = descriptionDiv.substring(0, paragraphEnd);
     }
   } catch (Exception e) {
     System.out.println(e.toString());
   }
   return movieDescription;
 }
Beispiel #9
0
  /**
   * Fetches the given page (with the connection timeout set to 0) into {@code doc} and adds the
   * absolute URL of every matching link to {@code myProdLinks}.
   *
   * <p>Links are the {@code a[href]} elements reached through two successive rounds of table,
   * tbody, tr, td selection. Links whose markup contains "cart" are skipped.
   *
   * @param url the page to fetch
   * @throws IOException if fetching the page fails
   */
  @Override
  void parseUrl(String url) throws IOException {

    doc = Jsoup.connect(url).timeout(0).get();

    Elements outerCells = doc.select("table").select("tbody").select("tr").select("td");
    Elements innerCells = outerCells.select("table").select("tbody").select("tr").select("td");
    for (Element anchor : innerCells.select("a[href]")) {
      boolean isCartLink = anchor.toString().contains("cart");
      if (isCartLink) {
        continue;
      }
      myProdLinks.add(anchor.attr("abs:href"));
    }
  }
Beispiel #10
0
 /**
  * parseMoviesHTML(String) takes a formatted title query and returns a string array of movie
  * HTML source code. Maximum of 10 movies; unused slots stay empty strings.
  *
  * <p>TODO Doesn't always work but works for most cases, HTML tends to change for some results,
  * will be changed when the reason is found out.
  *
  * @author defq0n
  * @param titleQuery Formatted title query; it is inserted into the search URL as-is.
  * @return moviesHTML String array (length 10) with the HTML source of each result.
  */
 public static String[] parseMoviesHTML(String titleQuery) {
   // List of movies in HTML, limited to 10, initialized to empty strings
   String[] moviesHTML = {"", "", "", "", "", "", "", "", "", ""};
   try {
     // Get the document using JSoup
     Document d =
         Jsoup.connect("http://www.imdb.com/find?ref_=nv_sr_fn&q=" + titleQuery + "&s=all").get();
     // HTML of body element
     String xhtml = d.body().toString();

     // HTML from the first <tbody> up to the first </tbody>; empty when either is missing
     int tbodyStart = xhtml.indexOf("<tbody>");
     int tbodyEnd = xhtml.indexOf("</tbody>");
     String tbody =
         (tbodyStart >= 0 && tbodyStart < tbodyEnd) ? xhtml.substring(tbodyStart, tbodyEnd) : "";

     String cellOpen = "<td class=\"result_text\">";
     String cellClose = ") </td>";
     for (int counter = 0; counter < moviesHTML.length; counter++) {
       int closeIdx = tbody.indexOf(cellClose);
       if (closeIdx < 0) {
         // No further result cell: the remaining slots stay empty.
         break;
       }
       int from = tbody.indexOf(cellOpen) + cellOpen.length();
       int to = closeIdx + cellClose.length();
       if (from < to) {
         moviesHTML[counter] = tbody.substring(from, to);
       }
       // Drop everything up to and including this result before looking for the next one.
       tbody = tbody.substring(to);
     }
   } catch (Exception e) {
     // Printing the exception itself is always safe, unlike indexing into its stack trace.
     System.out.println(e.toString());
   }
   return moviesHTML;
 }
Beispiel #11
0
 /**
  * Parses through the movie's page html and returns the first genre.
  *
  * <p>The genre is the text between the first {@code itemprop="genre"} attribute and the next
  * {@code </span>}, searched only up to the {@code itemprop="description"} attribute. An empty
  * string is returned when the page cannot be fetched or the markup is not found.
  *
  * @param pageLink is the extended imdb url for the movie page.
  * @return genre String containing the genre, or an empty string.
  */
 private static String getGenre(String pageLink) {
   String genre = "";
   try {
     Document d = Jsoup.connect("http://imdb.com" + pageLink).get();
     String html = d.body().toString();

     String genreMarker = "itemprop=\"genre\"";
     int divStart = html.indexOf(genreMarker);
     int divEnd = html.indexOf("itemprop=\"description\"");
     String genreDiv =
         (divStart >= 0 && divStart < divEnd) ? html.substring(divStart, divEnd) : "";

     // 17 = the 16 marker characters plus one more (presumably the tag's closing '>').
     int genreStart = genreDiv.indexOf(genreMarker) + 17;
     int genreEnd = genreDiv.indexOf("</span>");
     if (genreStart < genreEnd) {
       genre = genreDiv.substring(genreStart, genreEnd);
     }
   } catch (Exception e) {
     System.out.println(e.toString());
   }
   return genre;
 }
  /**
   * Fetches the e-can reservation page and its captcha image, then answers the request with a
   * JSON array of two objects.
   *
   * <p>The first object holds the ASP.NET form fields ({@code __EVENTTARGET},
   * {@code __EVENTARGUMENT}, {@code __VIEWSTATE}, {@code __VIEWSTATEGENERATOR}), the captcha URL,
   * the captcha image as a Base64 {@code data:} URI and, when the server set one, the first
   * cookie as a name/value entry. The second object maps a running index to the markup of each
   * {@code #ddlGetdate} option.
   *
   * <p>Both upstream responses and the HTTP client are closed before the method returns, and a
   * missing cookie no longer aborts the request.
   *
   * @param request the incoming request (not inspected)
   * @param response receives the JSON payload as {@code application/json}, UTF-8
   * @throws ServletException declared by the servlet contract
   * @throws IOException if either upstream request or writing the response fails
   */
  @SuppressWarnings("unchecked")
  protected void doGet(HttpServletRequest request, HttpServletResponse response)
      throws ServletException, IOException {
    // 0.init
    BasicCookieStore cookieStore = new BasicCookieStore();
    CloseableHttpClient httpClient =
        HttpClients.custom().setDefaultCookieStore(cookieStore).build();
    try {
      // 1.Send Get request header to server.
      // Get the response Html page.
      System.out.println("==========Send Request to e-can server==========");
      HttpGet httpGet = new HttpGet("https://www.e-can.com.tw/reservationUNMember_online.aspx");
      httpGet.addHeader(
          "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
      httpGet.addHeader("Accept-Encoding", "gzip, deflate");
      httpGet.addHeader("Accept-Language", "zh-TW,zh;q=0.8,en-US;q=0.5,en;q=0.3");
      httpGet.addHeader("Connection", "Keep-Alive");
      httpGet.addHeader("Host", "www.e-can.com.tw");
      httpGet.addHeader("User-Agent", "Mozilla");

      String html;
      CloseableHttpResponse pageResp = httpClient.execute(httpGet);
      try {
        System.out.println();

        // show server response status > GET 200 OK
        System.out.println(pageResp.getStatusLine());
        for (Header h : pageResp.getAllHeaders()) {
          System.out.println(h);
        }
        System.out.println("**********End of Headers********** \n\n");

        html = EntityUtils.toString(pageResp.getEntity());
      } finally {
        pageResp.close();
      }

      // 2.Use Jsoup to parse html page.
      // Select cssString to get captchaURL and captchaKey source.
      Document htmlDoc = Jsoup.parse(html);
      // info for post later
      String __EVENTTARGET = htmlDoc.select("#__EVENTTARGET").val();
      String __EVENTARGUMENT = htmlDoc.select("#__EVENTARGUMENT").val();
      String __VIEWSTATE = htmlDoc.select("#__VIEWSTATE").val();
      String __VIEWSTATEGENERATOR = htmlDoc.select("#__VIEWSTATEGENERATOR").val();

      int count = 0;
      JSONObject joOption = new JSONObject();
      for (Element e : htmlDoc.select("#ddlGetdate > option")) {
        joOption.put(count++, e.toString());
      }
      System.out.println("joOption = " + joOption);

      Elements elementCaptcha = htmlDoc.select("#captcha");
      System.out.println(elementCaptcha.attr("src"));
      String captchaURL = "https://www.e-can.com.tw/" + elementCaptcha.attr("src");

      // show URL
      System.out.println("captchaURL=" + captchaURL);

      // 3.Send GET request to get the captchaImage source.
      // Encode source to base64 String.
      System.out.println("==========Send request to e-can for captcha image==========");
      httpGet = new HttpGet(captchaURL);
      httpGet.addHeader("Referer", "https://www.e-can.com.tw/reservationUNMember_online.aspx");
      httpGet.addHeader("Accept", "image/png,image/*;q=0.8,*/*;q=0.5");
      httpGet.addHeader("Accept-Encoding", "gzip, deflate");
      httpGet.addHeader("Accept-Language", "zh-TW,zh;q=0.8,en-US;q=0.5,en;q=0.3");
      httpGet.addHeader("Connection", "Keep-Alive");
      httpGet.addHeader("Host", "www.e-can.com.tw");
      httpGet.addHeader("User-Agent", "Mozilla");

      List<Cookie> cookies;
      byte[] bytes;
      CloseableHttpResponse imageResp = httpClient.execute(httpGet);
      try {
        System.out.println();

        cookies = cookieStore.getCookies();
        if (cookies.isEmpty()) {
          System.out.println("None");
        } else {
          for (Cookie cookie : cookies) {
            System.out.println("- " + cookie.toString());
          }
          // Only the first cookie is forwarded to the caller.
          System.out.println("cookieName= " + cookies.get(0).getName());
          System.out.println("cookieValue= " + cookies.get(0).getValue());
        }
        System.out.println();

        System.out.println(imageResp.getStatusLine());
        for (Header h : imageResp.getAllHeaders()) {
          System.out.println(h);
        }
        System.out.println("**********End of Headers********** \n\n");

        // Read the whole image in chunks rather than one byte per call.
        HttpEntity entity = imageResp.getEntity();
        InputStream instream = entity.getContent();
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        byte[] buffer = new byte[4096];
        int read;
        while ((read = instream.read(buffer)) != -1) {
          baos.write(buffer, 0, read);
        }
        bytes = baos.toByteArray();
        EntityUtils.consume(entity);
      } finally {
        imageResp.close();
      }

      String captchaImage = "data:image/png;base64," + new BASE64Encoder().encode(bytes);

      // 4.Use Json format to wrap url, key and source string.
      // Then send response to ajax request from index.jsp
      JSONObject jo = new JSONObject();
      jo.put("__EVENTTARGET", __EVENTTARGET);
      jo.put("__EVENTARGUMENT", __EVENTARGUMENT);
      jo.put("__VIEWSTATE", __VIEWSTATE);
      jo.put("__VIEWSTATEGENERATOR", __VIEWSTATEGENERATOR);
      jo.put("captchaURL", captchaURL);
      jo.put("captchaImage", captchaImage);
      if (!cookies.isEmpty()) {
        jo.put(cookies.get(0).getName(), cookies.get(0).getValue());
      }
      String bothJson = "[" + jo + "," + joOption + "]";
      response.setContentType("application/json");
      response.setCharacterEncoding("utf-8");
      PrintWriter out = response.getWriter();
      out.print(bothJson);
      out.flush();
    } finally {
      httpClient.close();
    }
  }
  /**
   * Parses an HTML message list into messages.
   *
   * <p>Every {@code div} whose class attribute is {@code "post"} or starts with {@code "post "}
   * becomes one {@code PointMessage}, filled with user name, ids, tags, timestamp, text and
   * CSRF token. Posts whose creation date cannot be parsed are skipped. In a second pass,
   * replies whose text does not start with "@" are prefixed with "@" plus the user name of the
   * message they reply to, when that message is part of the same list.
   *
   * @param htmlStr the HTML to parse
   * @return the parsed messages, in document order
   */
  public ArrayList<JuickMessage> parseWebMessageListPure(String htmlStr) {
    ArrayList<JuickMessage> retval = new ArrayList<JuickMessage>();

    Document parsed = Jsoup.parse(htmlStr);
    Elements posts = parsed.select("div");
    // Two formatters for the same pattern: English month names first, Russian as fallback.
    ISimpleDateFormat sdf;
    ISimpleDateFormat sdf2;
    sdf = DevJuickComMessages.sdftz.createSDF("yyyy dd MMM HH:mm", "en", "US", "UTC");
    sdf2 = DevJuickComMessages.sdftz.createSDF("yyyy dd MMM HH:mm", "ru", "RU", "UTC");
    Calendar cal = Calendar.getInstance();
    int currentYear = cal.get(Calendar.YEAR);
    for (Element post : posts) {
      String postClass = post.attr("class");
      if (postClass.equals("post") || postClass.startsWith("post ")) {
        PointMessage message = new PointMessage();
        message.User = new JuickUser();
        // User name: the avatar's alt text, falling back to the author link text.
        message.User.UName = post.select("div[class=info] > a > img").attr("alt");
        if (message.User.UName.length() == 0) {
          message.User.UName = post.select("div[class=author] > a").text();
        }
        String dataId = post.attr("data-id");
        String dataCommentId = post.attr("data-comment-id");
        String dataToCommentId = post.attr("data-to-comment-id");
        message.setMID(new PointMessageID(message.User.UName, dataId, 0));
        if (dataCommentId.length() > 0) {
          message.setRID(Integer.parseInt(dataCommentId));
        }
        if (dataToCommentId.length() > 0) {
          message.setReplyTo(Integer.parseInt(dataToCommentId));
        }
        message.tags = new Vector<String>();
        for (Element el : post.select("a[class=tag]")) {
          message.tags.add(el.text());
        }
        message.microBlogCode = PointMessageID.CODE;
        // Join the text of all "created" divs; the current year is prepended before parsing
        // because the pattern expects a leading year.
        StringBuilder dt = new StringBuilder();
        for (Element el : post.select("div[class=created]")) {
          dt.append(" ");
          dt.append(el.text());
        }
        try {
          message.Timestamp = new Date(sdf.parse(currentYear + " " + dt.toString().trim()));
        } catch (IllegalArgumentException e) {
          try {
            message.Timestamp = new Date(sdf2.parse(currentYear + " " + dt.toString().trim()));
          } catch (IllegalArgumentException e1) {
            // Neither formatter accepted the date: skip this post entirely.
            continue;
          }
        }
        // A timestamp more than 50 days in the future is moved back by one year.
        Date mt = message.Timestamp;
        if (mt.getTime() > System.currentTimeMillis() + 50 * 24 * 60 * 60 * 1000L) {
          Calendar cal2 = Calendar.getInstance();
          cal2.setTime(mt);
          cal2.set(Calendar.YEAR, cal2.get(Calendar.YEAR) - 1);
          message.Timestamp = cal2.getTime();
        }
        // Message body: prefer "text-content", fall back to "text".
        Elements postEls = post.select("div[class=text-content]");
        if (postEls.size() < 1) {
          postEls = post.select("div[class=text]");
        }
        // Collect the src of every image inside "postimg" links; appended to the text below.
        String referencedImages = "";
        Elements postimg = post.select("a[class=postimg]");
        for (Element as : postimg) {
          Elements imgs = as.select("img");
          for (Element img : imgs) {
            String src = img.attr("src");
            referencedImages += " " + src;
          }
        }
        message.csrf_token = post.select("input[name=csrf_token]").attr("value");
        // last part
        if (postEls.size() < 1) {
          message.Text = "Error parsing text ;-(";
        } else {
          // NOTE(review): cleanupElement is defined elsewhere; its result is attached to the
          // original element so that ownerDocument() below is available.
          Element elem = cleanupElement(postEls.get(0));
          postEls.get(0).appendChild(elem); // add to document
          // Pretty-printing is switched off on the owning document's output settings.
          Document.OutputSettings os = elem.ownerDocument().outputSettings();
          os.prettyPrint(false);
          String text = Utils.replace(elem.toString(), "\n", " ");
          text = Utils.replace(text, "&amp;", "&"); // this was improperly done in cleanupElement
          // Collapse runs of spaces until the length stops changing.
          while (true) {
            long olds = text.length();
            text = Utils.replace(text, "  ", " ");
            long news = text.length();
            if (news == olds) break;
          }
          // Reply count is best-effort: a missing or non-numeric value is ignored.
          try {
            message.replies = Integer.parseInt(post.select("span[class=cn]").text());
          } catch (Exception ex) {
          }
          text += referencedImages;
          message.Text = unwebMessageTextPoint(text);
        }
        retval.add(message);
      }
    }

    // Second pass: prefix replies with "@<user>" of the message they answer, if it is in the
    // list and the text does not already start with "@".
    for (JuickMessage juickMessage : retval) {
      if (juickMessage.getRID() != 0
          && juickMessage.getReplyTo() != 0
          && !juickMessage.Text.startsWith("@")) {
        String uzur = null;
        for (JuickMessage scan : retval) {
          if (scan.getRID() == juickMessage.getReplyTo()) {
            uzur = scan.User.UName;
            break;
          }
        }
        if (uzur != null) {
          juickMessage.Text = "@" + uzur + " " + juickMessage.Text;
        }
      }
    }

    return retval;
  }
  // start setting of selectorbar
  /**
   * Copies the selector bar content of {@code ele} onto {@code selectorBarPanelNode}.
   *
   * <p>The first anchor of {@code ele} supplies the {@code title} and {@code titleurl}
   * properties. If {@code ele} has a {@code div.menu} and a second child element, the last
   * anchor of that child supplies {@code alllinktext} and {@code alllinkurl}, and every
   * {@code ul} inside it is written to {@code panelitems} as an array of JSON strings with the
   * keys {@code linktext}, {@code linkurl} and {@code size}. Every URL goes through
   * {@code FrameworkUtils.getLocaleReference}. When the menu is missing, a list item describing
   * the problem is appended to {@code sb}.
   *
   * <p>Nothing is written when {@code ele} is null or has no anchor. Any exception is logged
   * and not propagated.
   *
   * @param selectorBarPanelNode node receiving the properties
   * @param ele element holding the selector bar markup; may be null
   * @param urlMap passed through to {@code FrameworkUtils.getLocaleReference}
   * @param locale passed through to {@code FrameworkUtils.getLocaleReference}
   */
  public void selectorBarTranslate(
      Node selectorBarPanelNode, Element ele, Map<String, String> urlMap, String locale) {

    try {
      Element titleAnchor = ele != null ? ele.getElementsByTag("a").first() : null;
      if (titleAnchor == null) {
        // This case used to end in a NullPointerException swallowed by the catch below. The
        // outcome is unchanged (no property is written), only without the stack trace.
        log.debug("Selector bar element or its title anchor is missing.");
        return;
      }
      String title = titleAnchor.text();
      String titleUrl = titleAnchor.absUrl("href");
      if (StringUtil.isBlank(titleUrl)) {
        titleUrl = titleAnchor.attr("href");
      }
      // Start extracting valid href
      log.debug("Before selector bar title LinkUrl" + titleUrl + "\n");
      titleUrl = FrameworkUtils.getLocaleReference(titleUrl, urlMap, locale, sb);
      log.debug("after selector bar title LinkUrl" + titleUrl + "\n");
      // End extracting valid href
      log.debug("selector component titleUrl: " + titleUrl);
      selectorBarPanelNode.setProperty("title", title);
      selectorBarPanelNode.setProperty("titleurl", titleUrl);
      if (ele.childNodeSize() >= 2) {
        log.debug("Child node size is greater than 1.");
        if (ele.select("div.menu").isEmpty()) {
          log.debug("Menu is not available.");
          sb.append(
              "<li>Selector bar drop down menu elements does not exist on the locale page.</li>");
        } else {
          log.debug("Menu is available.");
          // childNodeSize() also counts text nodes, so make sure a second child ELEMENT exists
          // before asking for it.
          Element menuEle = ele.children().size() > 1 ? ele.child(1) : null;
          if (menuEle != null) {
            log.debug("selector component menuEle: " + menuEle.toString());
            Element anchor = menuEle.getElementsByTag("a").last();
            String allLinkText = anchor != null ? anchor.text() : "";
            String allLinkUrl = anchor != null ? anchor.absUrl("href") : "";
            if (anchor != null && StringUtil.isBlank(allLinkUrl)) {
              allLinkUrl = anchor.attr("href");
            }
            // Start extracting valid href
            log.debug("Before selector bar menu LinkUrl" + allLinkUrl + "\n");
            allLinkUrl = FrameworkUtils.getLocaleReference(allLinkUrl, urlMap, locale, sb);
            log.debug("after selector bar menu LinkUrl" + allLinkUrl + "\n");
            // End extracting valid href
            selectorBarPanelNode.setProperty("alllinktext", allLinkText);
            selectorBarPanelNode.setProperty("alllinkurl", allLinkUrl);

            Elements menuUlList = menuEle.getElementsByTag("ul");
            for (Element element : menuUlList) {
              java.util.List<String> list = new ArrayList<String>();
              Elements menuLiList = element.getElementsByTag("li");
              log.debug("selector bar menu item count: " + menuLiList.size());

              for (Element li : menuLiList) {
                JSONObject jsonObj = new JSONObject();
                Element listItemAnchor = li.getElementsByTag("a").first();
                // A list item without an anchor yields empty text and an empty href.
                String anchorText = listItemAnchor != null ? listItemAnchor.text() : "";
                String anchorHref = listItemAnchor != null ? listItemAnchor.absUrl("href") : "";
                if (listItemAnchor != null && StringUtil.isBlank(anchorHref)) {
                  anchorHref = listItemAnchor.attr("href");
                }
                // Start extracting valid href
                log.debug("Before selectorbarLinkUrl" + anchorHref + "\n");
                anchorHref = FrameworkUtils.getLocaleReference(anchorHref, urlMap, locale, sb);
                log.debug("after selectorbarLinkUrl" + anchorHref + "\n");
                // End extracting valid href

                jsonObj.put("linktext", anchorText);
                jsonObj.put("linkurl", anchorHref);
                jsonObj.put("size", "");
                list.add(jsonObj.toString());
              }

              // NOTE(review): with several <ul> elements each one overwrites "panelitems";
              // only the last list survives. Confirm this is intended.
              selectorBarPanelNode.setProperty("panelitems", list.toArray(new String[list.size()]));
            }
          } else {
            sb.append(
                "<li>Selector bar drop down menu elements does not exist on the locale page.</li>");
          }
        }
      } else {
        sb.append(
            "<li>Selector bar drop down menu elements does not exist on the locale page.</li>");
      }

    } catch (Exception e) {
      log.error("Failed to translate selector bar", e);
    }
  }
Beispiel #15
0
  /**
   * Scrapes one news page and packs its data into a news bean. News URLs look like
   * http://see.xidian.edu.cn/html/news/7928.html
   *
   * <p>Image {@code src} values and link {@code href} values inside the article body are
   * rewritten whenever {@code ImageTool.convertUrl} or {@code UrlTool.dealAttachmentUrl} return
   * a different URL.
   *
   * @param id the sequence number of the news page
   * @return the news bean built from the page. This method never returns {@code null} itself; a
   *     page without the expected elements surfaces as an exception instead.
   * @throws Exception if the page or the view counter cannot be fetched or parsed
   */
  public static ArticleItem parseNewsItem(int id) throws Exception {
    // Build the news URL from the numeric suffix.
    String urlStr = Constant.ARTICLE_BASE_URL + id + ".html";

    // Fetching the page as a string with a plain GET and parsing it afterwards garbles a small
    // part of the text, so jsoup fetches the page itself.
    Document doc = Jsoup.connect(urlStr).timeout(10000).get();
    // Drop the "\n" jsoup adds to the HTML string, so it can be returned inside JSON.
    doc.outputSettings().prettyPrint(false);

    Element articleEle = doc.getElementById("article");
    // Title
    String titleStr = articleEle.getElementById("article_title").text();

    // article_detail holds the date, the source label and the view count.
    Elements details = articleEle.getElementById("article_detail").getElementsByTag("span");
    // Publication date
    String dateStr = details.get(0).text();
    // News source
    String rawSource = details.get(1).text();
    // Strip the source label: a bare label means no source was given, otherwise the first
    // three characters are cut off.
    String sourceStr =
        SOURCE_PREFIX.equals(rawSource.trim()) ? "SeeNews" : rawSource.substring(3).trim();

    // Visiting the news page increases its view count by one. The count is rendered by JS, so
    // it is requested separately and reduced to its digits.
    String jsStr = HttpTool.doGet(COUNT_BASE_URL + id);
    int readTimes = Integer.parseInt(jsStr.replaceAll("\\D+", ""));

    // Main news content. text() would drop the HTML tags; toString() keeps them so the
    // content can be shown in an Android WebView.
    Element contentEle = articleEle.getElementById("article_content");
    String contentStr = contentEle.toString();

    // Images are uploaded to Qiniu and their addresses in the body are replaced accordingly.
    Elements images = contentEle.getElementsByTag("img");
    String[] imageUrls = new String[images.size()];
    int imageIndex = 0;
    for (Element image : images) {
      String origin = image.attr("src");
      String converted = ImageTool.convertUrl(id, origin);
      imageUrls[imageIndex++] = converted;
      if (origin.equals(converted)) {
        // The URL only changes when the image was uploaded; unchanged URLs are left alone.
        continue;
      }
      contentStr =
          contentStr.replace(
              Constant.SRC_PREFIX + origin,
              Constant.SRC_PREFIX + Constant.BUCKET_HOST_NAME + converted);
    }

    // Handle relative link URLs; this does not clash with the image URLs above.
    for (Element anchor : contentEle.getElementsByTag("a")) {
      String origin = anchor.attr("href");
      if (Constant.DEBUG) {
        System.out.println("原始 href=" + origin);
      }
      String newUrl = UrlTool.dealAttachmentUrl(id, origin);

      // Replace only when the URL actually changed. This keeps an attachment that appears
      // several times from being prefixed repeatedly, e.g.
      // http://see.xidian.edu.cnhttp://see.xidian.edu.cn/uploads/file
      if (!origin.equals(newUrl)) {
        contentStr =
            contentStr.replace(Constant.HREF_PREFIX + origin, Constant.HREF_PREFIX + newUrl);
      }
    }

    return new ArticleItem(id, imageUrls, titleStr, dateStr, readTimes, sourceStr, contentStr);
  }
  @Override
  public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception {
    LOGGER.debug("getMetadata() " + options.toString());

    if (options.getType() != MediaType.MOVIE) {
      throw new UnsupportedMediaTypeException(options.getType());
    }

    String id = "";
    if (StringUtils.isNotBlank(options.getId(providerInfo.getId()))) {
      id = options.getId(providerInfo.getId());
    }

    if (StringUtils.isBlank(id) && options.getResult() != null) {
      if (StringUtils.isEmpty(options.getResult().getId())) {
        id = StrgUtils.substr(options.getResult().getUrl(), "id=(.*?)");
      } else {
        id = options.getResult().getId();
      }
    }

    // we can not scrape without zelluloid id and url
    if (StringUtils.isBlank(id) && StringUtils.isBlank(options.getResult().getUrl())) {
      throw new Exception("cannot scrape without id and url");
    }

    String detailurl = BASE_URL + "/filme/index.php3?id=" + id;
    if (StringUtils.isBlank(id)) {
      detailurl = options.getResult().getUrl();
    }

    MediaMetadata md = new MediaMetadata(providerInfo.getId());

    Url url;
    try {
      url = new CachedUrl(detailurl);
      InputStream in = url.getInputStream();
      Document doc = Jsoup.parse(in, PAGE_ENCODING, "");
      in.close();

      // parse title
      String title = doc.getElementsByAttributeValue("property", "og:title").attr("content").trim();
      md.setTitle(title);

      // parse plot
      String plot = doc.getElementsByAttributeValue("class", "bigtext").text();
      md.setPlot(plot);
      md.setTagline(plot.length() > 150 ? plot.substring(0, 150) : plot);

      // parse poster
      Elements el = doc.getElementsByAttributeValueStarting("src", "/images/poster");
      if (el.size() == 1) {
        // Poster
        MediaArtwork ma =
            new MediaArtwork(providerInfo.getId(), MediaArtwork.MediaArtworkType.POSTER);
        ma.setPreviewUrl(BASE_URL + el.get(0).attr("src"));
        ma.setDefaultUrl(BASE_URL + el.get(0).attr("src"));
        ma.setLanguage(options.getLanguage().getLanguage());
        md.addMediaArt(ma);
      }

      // parse year
      el = doc.getElementsByAttributeValueContaining("href", "az.php3?j=");
      if (el.size() == 1) {
        try {
          md.setYear(Integer.parseInt(el.get(0).text()));
        } catch (Exception ignored) {
        }
      }

      // parse cinema release
      el = doc.getElementsByAttributeValueContaining("href", "?v=w");
      if (el.size() > 0) {
        try {
          SimpleDateFormat sdf = new SimpleDateFormat("dd.MM.yyyy");
          Date d = sdf.parse(el.get(0).text());
          md.setReleaseDate(d);
        } catch (Exception e) {
          LOGGER.warn("cannot parse cinema release date: " + el.get(0).text());
        }
      }

      // parse original title
      md.setOriginalTitle(StrgUtils.substr(doc.toString(), "Originaltitel: (.*?)\\<"));

      if (StringUtils.isEmpty(md.getOriginalTitle())) {
        md.setOriginalTitle(md.getTitle());
      }

      // parse runtime
      String rt = (StrgUtils.substr(doc.toString(), "ca.&nbsp;(.*?)&nbsp;min"));
      if (!rt.isEmpty()) {
        try {
          md.setRuntime(Integer.valueOf(rt));
        } catch (Exception e2) {
          LOGGER.warn("cannot convert runtime: " + rt);
        }
      }

      // parse genres
      el = doc.getElementsByAttributeValueContaining("href", "az.php3?g=");
      for (Element g : el) {
        String gid = g.attr("href").substring(g.attr("href").lastIndexOf('=') + 1);
        md.addGenre(getTmmGenre(gid));
      }

      // parse cert
      // FSK: ab 12, $230 Mio. Budget
      String fsk = StrgUtils.substr(doc.toString(), "FSK: (.*?)[,<]");
      if (!fsk.isEmpty()) {
        md.addCertification(Certification.findCertification(fsk));
      }

      // parse rating
      Elements ratings = doc.getElementsByAttributeValue("class", "ratingBarTable");
      if (ratings.size() == 2) { // get user rating
        Element e = ratings.get(1);
        // <div>87%</div>
        String r = e.getElementsByTag("div").text().replace("%", "");
        try {
          md.setRating(Float.valueOf(r) / 10); // only 0-10
        } catch (Exception e2) {
          LOGGER.warn("cannot convert rating: " + r);
        }
      }

      // details page
      doc = null;
      String detailsUrl = BASE_URL + "/filme/details.php3?id=" + id;
      try {
        url = new CachedUrl(detailsUrl);
        in = url.getInputStream();
        doc = Jsoup.parse(in, PAGE_ENCODING, "");
        in.close();
      } catch (Exception e) {
        LOGGER.error("failed to get details: " + e.getMessage());
      }

      if (doc != null) {
        Element tab = doc.getElementById("ccdetails");
        int header = 0;
        String lastRole = "";
        for (Element tr : tab.getElementsByTag("tr")) {
          if (tr.toString().contains("dyngfx")) { // header gfx
            if (tr.toString().contains("Besetzung")) {
              header = 1;
            } else if (tr.toString().contains("Crew")) {
              header = 2;
            } else if (tr.toString().contains("Produktion")) { // company, not producers
              header = 3;
            } else if (tr.toString().contains("Verleih")) {
              header = 4;
            } else if (tr.toString().contains("Alternativtitel")) {
              header = 5;
            }
            continue;
          } else {
            // no header gfx, so data
            MediaCastMember mcm = new MediaCastMember();
            el = tr.getElementsByTag("td");
            if (header == 1) {
              // actors
              if (el.size() == 2) {
                String role = "" + el.get(0).text().trim();
                // text() decodes &nbsp; to \u00a0
                if (role.equals("\u00a0") || StringUtils.isBlank(role)) {
                  continue;
                }
                mcm.setCharacter(role);
                mcm.setName(el.get(1).getElementsByTag("a").text());
                mcm.setId(
                    StrgUtils.substr(el.get(1).getElementsByTag("a").attr("href"), "id=(\\d+)"));
                mcm.setType(MediaCastMember.CastType.ACTOR);
                md.addCastMember(mcm);
                // parsing actor pages would be too heavy here just for actor images.
              }
            } else if (header == 2) {
              // crew
              if (el.size() == 2) {
                String crewrole = el.get(0).html().trim();
                mcm.setName(el.get(1).getElementsByTag("a").text());
                if (crewrole.equals("&nbsp;")) {
                  crewrole = lastRole; // pop previous
                } else {
                  lastRole = crewrole; // push new
                }
                mcm.setPart(crewrole);
                switch (crewrole) {
                  case "Regie":
                    mcm.setType(MediaCastMember.CastType.DIRECTOR);
                    break;
                  case "Drehbuch":
                    mcm.setType(MediaCastMember.CastType.WRITER);
                    break;
                  case "Produktion":
                    mcm.setType(MediaCastMember.CastType.PRODUCER);
                    break;
                  default:
                    mcm.setType(MediaCastMember.CastType.OTHER);
                    break;
                }
                mcm.setId(
                    StrgUtils.substr(el.get(1).getElementsByTag("a").attr("href"), "id=(\\d+)"));
                md.addCastMember(mcm);
              }
            } else if (header == 3) {
              // production
              md.addProductionCompany(el.get(0).text());
            }
          }
        }
      }

      // get links page
      doc = null;
      String linksUrl = BASE_URL + "/filme/links.php3?id=" + id;
      try {
        url = new CachedUrl(linksUrl);
        in = url.getInputStream();
        doc = Jsoup.parse(in, PAGE_ENCODING, "");
        in.close();
      } catch (Exception e) {
        LOGGER.error("failed to get links page: " + e.getMessage());
      }

      if (doc != null) {
        el = doc.getElementsByAttributeValueContaining("href", "german.imdb.com");
        if (el != null && el.size() > 0) {
          String imdb = StrgUtils.substr(el.get(0).attr("href"), "(tt\\d{7})");
          if (imdb.isEmpty()) {
            imdb = "tt" + StrgUtils.substr(el.get(0).attr("href"), "\\?(\\d+)");
          }
          md.setId(MediaMetadata.IMDB, imdb);
        }
      }
    } catch (Exception e) {
      LOGGER.error("Error parsing " + detailurl);

      throw e;
    }

    return md;
  }