Esempio n. 1
0
  /**
   * Downloads the comic image found on the given page and stores it as a PNG file.
   *
   * <p>The page is fetched with Jsoup and the {@code img} inside the element with id {@code
   * comic} is looked up. Its {@code alt} text, with whitespace and path separators replaced by
   * underscores, becomes the file name. I/O failures are printed to stderr and the method returns
   * without writing anything.
   *
   * @param arg URL of the comic page to fetch
   */
  public static void getComic(String arg) {
    try {
      Document doc = Jsoup.connect(arg).get();

      // Select the img tag in the comic id
      Elements links = doc.select("#comic img");
      String altText = links.attr("alt");
      System.out.print("\nComic Name : " + altText);
      System.out.print("\nImage Source : " + links.attr("src") + "\n\n");

      // "abs:" resolves relative and protocol-relative sources (e.g. "//host/x.png") against
      // the page URL; new URL(...) rejects those forms when they are passed as-is.
      URL url = new URL(links.attr("abs:src"));

      RenderedImage comic = ImageIO.read(url);
      if (comic == null) {
        // ImageIO.read returns null when no registered reader understands the data, and
        // ImageIO.write would then fail with an IllegalArgumentException.
        System.err.println("Unsupported or unreadable image: " + url);
        return;
      }

      // Path separators in the alt text would otherwise point into a non-existent directory.
      String baseName = altText.replaceAll("[\\s/\\\\]", "_");

      ImageIO.write(comic, "png", new File("/home/paranoidsp/Pictures/xkcd/" + baseName + ".png"));

      /*
       * TODO: the transcript (doc.select("#transcript")) isn't formatted, so it comes back as
       * one large line of text instead of readable dialogue. Find a way to get it.
       */

    } catch (IOException exp) {
      exp.printStackTrace();
    }
  }
Esempio n. 2
0
  /**
   * Parses feed markup and appends one {@code FeedItem} per {@code div.tie-box} found inside
   * {@code div.tie-wrapper} to {@code mFeedItems}, then notifies {@code mFeedItemAdapter}.
   *
   * <p>Any exception raised while parsing (for example a box that lacks one of the expected
   * elements) is printed and stops the parsing; items added before the failure are kept.
   *
   * @param resource HTML markup of the feed page
   */
  private void parseFeedItem(String resource) {
    try {
      Document doc = Jsoup.parse(resource);
      Element masthead = doc.select("div.tie-wrapper").first();
      Elements feedBoxs = masthead.select("div.tie-box");

      for (Element feedPost : feedBoxs) {
        Element titleElement = feedPost.select("div.tie-header h2.tie-title a").first();
        Element nameElement =
            feedPost.select("div.tie-content div.tie-user div.user-info p span.user-name").first();
        Element sourceElement =
            feedPost.select("div.tie-content div.tie-user div.user-info p span.user-form").first();
        Element timestampElement =
            feedPost.select("div.tie-content div.tie-user div.user-info p.tie-date").first();
        Elements imageElement = feedPost.select("div.tie-content img.st-photo");
        Elements contentElements = feedPost.select("div.tie-content p:not(.tie-date):gt(0)");

        String title = titleElement.text();
        String name = nameElement.text();
        String source = sourceElement.text();
        String timestamp = timestampElement.text();

        // One line per content paragraph, each terminated by a newline.
        StringBuilder content = new StringBuilder();
        for (Element paragraph : contentElements) {
          content.append(paragraph.text()).append("\n");
        }

        // Compare by value: "!=" on strings only checks reference identity.
        String imageSrc = imageElement.attr("src");
        String image = imageSrc.isEmpty() ? null : url + imageSrc;

        FeedItem feedItem = new FeedItem();
        feedItem.setTitle(title);
        feedItem.setName(name);
        feedItem.setPostTime(timestamp);
        feedItem.setSource(source);
        feedItem.setImage(image);
        feedItem.setContent(content.toString());

        mFeedItems.add(feedItem);
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
    mFeedItemAdapter.notifyDataSetChanged();
  }
Esempio n. 3
0
  /**
   * Fetches the page at {@code url}, stores the text of the {@code topic_content} elements in
   * {@code title} and adds one {@code DetailEntity} to {@code entities} for every {@code tbody}
   * whose {@code reply_content} text is not empty.
   *
   * @param url address of the page to download
   */
  private void getDatafromJsoup(String url) {
    try {
      Document page = Jsoup.connect(url).get();

      String headerText = page.getElementsByClass("topic_content").text();
      Log.e("topic_content", headerText);
      title = headerText;

      for (Element body : page.getElementsByTag("tbody")) {
        DetailEntity detail = new DetailEntity();

        // Only protocol-relative CDN avatars are kept; they get an explicit scheme.
        String avatarSrc = body.getElementsByTag("img").attr("src");
        if (avatarSrc.startsWith("//cdn.")) {
          detail.setAvater("http:" + avatarSrc);
        }

        String replyText = body.getElementsByClass("reply_content").text();
        Log.e("reply_content", replyText);
        detail.setReply_count(replyText);

        Elements anchors = body.getElementsByTag("a");
        if (anchors.attr("href").startsWith("/member/")) {
          String anchorText = anchors.text();
          Log.e("title", anchorText);
          detail.setTitle(anchorText);
        }

        String fadeSmall = body.getElementsByClass("fade small").text();
        String smallFade = body.getElementsByClass("small fade").text();
        Log.e("other", fadeSmall + smallFade);

        if (!TextUtils.isEmpty(replyText)) {
          entities.add(detail);
        }
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
    /**
     * Copies the discussion link(s) and question title(s) found in {@code doc} into {@code
     * dailyNews}.
     *
     * <p>Every {@code view-more} block must contain a link whose text is the expected discussion
     * label; otherwise the method returns {@code false}. In the multi-question case titles that
     * were already added before the failing block stay in {@code dailyNews}. A question title that
     * is empty, or missing because there are fewer {@code question-title} elements than {@code
     * view-more} blocks, falls back to {@code dailyTitle}.
     *
     * @param doc parsed story page
     * @param dailyTitle title to use when a question has no title of its own
     * @param dailyNews object that receives the extracted data
     * @return {@code true} when at least one discussion link was found and all links matched
     * @throws JSONException declared for callers; not thrown by the code visible here
     */
    private boolean updateDailyNews(Document doc, String dailyTitle, DailyNews dailyNews)
        throws JSONException {
      final String discussionLinkText = "查看知乎讨论";
      Elements viewMoreElements = doc.getElementsByClass("view-more");

      if (viewMoreElements.isEmpty()) {
        return false;
      }

      if (viewMoreElements.size() == 1) {
        dailyNews.setMulti(false);

        Elements viewQuestionElement = viewMoreElements.select("a");
        if (!viewQuestionElement.text().equals(discussionLinkText)) {
          return false;
        }
        dailyNews.setQuestionUrl(viewQuestionElement.attr("href"));

        // Question title is the same with daily title
        String questionTitle = doc.getElementsByClass("question-title").text();
        dailyNews.setQuestionTitle(questionTitle.isEmpty() ? dailyTitle : questionTitle);
        return true;
      }

      dailyNews.setMulti(true);
      Elements questionTitleElements = doc.getElementsByClass("question-title");

      for (int j = 0; j < viewMoreElements.size(); j++) {
        // Guard the index: the page may carry fewer titles than discussion links.
        String questionTitle =
            j < questionTitleElements.size() ? questionTitleElements.get(j).text() : "";
        dailyNews.addQuestionTitle(questionTitle.isEmpty() ? dailyTitle : questionTitle);

        Elements viewQuestionElement = viewMoreElements.get(j).select("a");
        if (!viewQuestionElement.text().equals(discussionLinkText)) {
          return false;
        }
        dailyNews.addQuestionUrl(viewQuestionElement.attr("href"));
      }

      return true;
    }
Esempio n. 5
0
 /**
  * Downloads the page at {@code url} on a helper thread and collects one link entry per {@code
  * player_conte} element, pairing it with the anchor at the same index inside the first {@code
  * video_option} (or, if there is none, {@code video_option_act}) element.
  *
  * <p>When an element's {@code src} is not an http(s) URL, the link is taken from the first
  * {@code key=value} pair of its {@code flashvars} data. I/O and index errors end the collection
  * early; entries gathered so far are still returned.
  *
  * @param url page to download
  * @return the collected entries (possibly empty), or {@code null} when the calling thread was
  *     interrupted while waiting for the download
  */
 public static ArrayList<EntryModel> getEpisodeLinks(final String url) {
   final ArrayList<EntryModel> result = new ArrayList<>();
   Thread thread =
       new Thread(
           () -> {
             try {
               Document document = Jsoup.connect(url).get();
               Element titleContainer = document.getElementsByClass("video_option").first();
               if (titleContainer == null) {
                 titleContainer = document.getElementsByClass("video_option_act").first();
               }
               if (titleContainer == null) {
                 // Neither title container exists; dereferencing it would kill this thread
                 // with an uncaught NullPointerException.
                 return;
               }
               Elements titleElements = titleContainer.getElementsByTag("a");
               Elements videoElements = document.getElementsByClass("player_conte");
               for (int i = 0; i < videoElements.size(); i++) {
                 Element video = videoElements.get(i);
                 String title = titleElements.get(i).text();
                 String linkUrl = video.attr("src");
                 if (!linkUrl.startsWith("http")) {
                   Elements flashElements = video.select("param[name=flashvars]");
                   linkUrl = flashElements.attr("value").split("[&;]")[0].split("=")[1];
                   if (!linkUrl.startsWith("http")) {
                     flashElements = video.select("[flashvars]");
                     linkUrl = flashElements.attr("flashvars").split("[&;]")[0].split("=")[1];
                   }
                 }
                 result.add(new EntryModel(Constants.TYPE_LINK, title, linkUrl));
               }
             } catch (IOException | IndexOutOfBoundsException e) {
               e.printStackTrace();
             }
           });
   thread.start();
   try {
     // join() also makes the worker's writes to result visible to this thread.
     thread.join();
     return result;
   } catch (InterruptedException e) {
     // Restore the flag so callers further up can still observe the interruption.
     Thread.currentThread().interrupt();
     e.printStackTrace();
     return null;
   }
 }
  /**
   * Renders the search-form input template for a many-to-one property and checks that the output
   * contains a {@code select} bound to the {@code customer} field.
   */
  @Test
  public void testGenerateManyToOneProperty() throws Exception {
    Map<String, Object> root =
        TestHelpers.createInspectionResultWrapper(ENTITY_NAME, MANY_TO_ONE_PROP);

    URL templateUrl =
        getClass().getResource(Deployments.BASE_PACKAGE_PATH + Deployments.SEARCH_FORM_INPUT);
    Resource<URL> templateResource = resourceFactory.create(templateUrl);
    FreemarkerTemplate template = new FreemarkerTemplate(templateResource);
    TemplateProcessor processor = processorFactory.fromTemplate(template);

    String output = processor.process(root);
    Document html = Jsoup.parseBodyFragment(output);
    assertThat(output.trim(), not(equalTo("")));

    Elements container = html.select("div.form-group");
    assertThat(container, notNullValue());

    // The select must sit directly inside the right-hand column of the form group.
    Elements formInputElement = container.select("div.col-sm-10 > select");
    assertThat(formInputElement.attr("id"), equalTo("customer"));
    assertThat(formInputElement.attr("ng-model"), equalTo("search.customer"));
  }
Esempio n. 7
0
 /**
  * Reads the comma-separated {@code value} attribute of the {@code #productsIdList} element on
  * the page at {@code url}.
  *
  * @param url page to load through {@code doc_get}
  * @return the attribute value split on commas, or {@code null} when no document was obtained
  * @throws IOException declared for callers; presumably raised by {@code doc_get}
  */
 public static String[] productsIdList_get(String url) throws IOException {
   Document page = doc_get(url);
   if (page == null) {
     return null;
   }
   Elements idHolder =
       page.select(
           "body>div.searchwrap.w980>div#bodyRight>#search_result>#plist>#search_table>#productsIdList");
   String joinedIds = idHolder.attr("value");
   return joinedIds.split(",");
 }
  /**
   * Second login step: builds the POST request for form {@code c2000004} from the page held in
   * {@code context} and queues it on {@code spider}; the response is handed to {@code
   * parseLoginStep3}.
   *
   * <p>When the page text contains a {@code strEnc(username, ...)} call, its three arguments are
   * passed to the {@code strEnc} function of {@code des/tel_com_des.js} to transform the phone
   * number and password before they are sent; otherwise both are sent unchanged.
   *
   * @param context response context of the previous request
   */
  protected void parseLoginStep2(SimpleObject context) {
    String pageText = ContextUtil.getContent(context);
    if (pageText == null) {
      return;
    }

    String encodedPhone = phoneNo;
    String encodedPassword = password;

    String encArgs = StringUtil.subStr("strEnc(username,", ");", pageText).trim();
    if (!StringUtils.isBlank(encArgs)) {
      // pwd, digit, f, s
      String[] keys = encArgs.replaceAll("\'", "").split(",");
      encodedPhone =
          executeJsFunc("des/tel_com_des.js", "strEnc", phoneNo, keys[0], keys[1], keys[2]);
      encodedPassword =
          executeJsFunc("des/tel_com_des.js", "strEnc", password, keys[0], keys[1], keys[2]);
    }

    Document page = ContextUtil.getDocumentOfContent(context);
    Elements loginForm = page.select("form#c2000004");
    String loginTicket = loginForm.select("input[name=lt]").attr("value");

    String randomId;
    if (authCode == null) {
      randomId = "";
    } else {
      randomId = authCode;
    }

    Request loginRequest = new Request(fixedFullUrl(loginForm.attr("action")));
    loginRequest.setMethod("POST");
    loginRequest.initNameValuePairs(12);

    // Parameters are added in the same order as before.
    loginRequest.addNameValuePairs("lt", loginTicket);
    loginRequest.addNameValuePairs("_eventId", "submit");
    loginRequest.addNameValuePairs("forbidpass", "null");
    loginRequest.addNameValuePairs("areaname", areaName);
    loginRequest.addNameValuePairs("password", encodedPassword);
    loginRequest.addNameValuePairs("authtype", "c2000004");
    loginRequest.addNameValuePairs("customFileld01", customField1);
    loginRequest.addNameValuePairs("customFileld02", customField2);
    loginRequest.addNameValuePairs("forbidaccounts", "null");
    loginRequest.addNameValuePairs("open_no", "c2000004");
    loginRequest.addNameValuePairs("username", encodedPhone);
    loginRequest.addNameValuePairs("randomId", randomId);
    loginRequest.setCharset(UAM_CHAR_SET);
    loginRequest.addObjservers(
        new AbstractProcessorObserver(util, WaringConstaint.ZGDX_3) {
          @Override
          public void afterRequest(SimpleObject context) {
            parseLoginStep3(context);
          }
        });
    spider.addRequest(loginRequest);
  }
Esempio n. 9
0
  /**
   * Downloads the page at {@code url} and fills {@code mArrayList} with one entry per element of
   * class {@code item}: its first link target, its image source and the text of the first
   * paragraph inside its {@code item-con} child.
   *
   * <p>Items without an {@code item-con} paragraph are skipped. Any exception is logged and ends
   * the parsing; entries collected so far are kept.
   *
   * @param integers unused
   * @return {@code mArrayList}; it is only rebuilt when {@code SDK_INT > 9}
   */
  @Override
  protected ArrayList<object> doInBackground(Integer... integers) {
    // NOTE(review): the download itself is also guarded by the SDK check, not only the
    // StrictMode setup - confirm this is intended.
    if (android.os.Build.VERSION.SDK_INT > 9) {
      StrictMode.ThreadPolicy policy = new StrictMode.ThreadPolicy.Builder().permitAll().build();
      StrictMode.setThreadPolicy(policy);
      mArrayList = new ArrayList<object>();
      try {
        Document doc = Jsoup.connect(url).timeout(10 * 1000).get();
        Elements elements = doc.body().getElementsByClass("item");
        Log.i(null, "size" + elements.size());
        for (Element link : elements) {
          Elements els_links = link.select("a[href]");
          Log.i(null, "links" + els_links.attr("href"));
          Elements els_linksw = link.select("img");
          Element itemContent = link.getElementsByClass("item-con").first();
          Element paragraph =
              itemContent == null ? null : itemContent.getElementsByTag("p").first();
          Log.i(null, "img" + els_linksw.attr("src"));

          if (paragraph == null) {
            // A single incomplete item must not abort the parsing of the remaining ones.
            continue;
          }

          object _util = new object();
          _util.setImg(els_linksw.attr("src"));
          _util.setTxt(paragraph.text());
          _util.setUrl(els_links.attr("href"));
          mArrayList.add(_util);
        }
      } catch (Exception e) {
        Log.i(null, "exception" + e);
      }
    }
    return mArrayList;
  }
Esempio n. 10
0
  /**
   * Extracts the {@code session_id} query parameter from the logout link of a page.
   *
   * @param document parsed page that may contain a link starting with {@code logout.php?}
   * @return the session id, or {@code null} when there is no logout link or the link carries no
   *     {@code session_id=} parameter
   */
  public String parserSessionID(Document document) {
    final String key = "session_id=";

    Elements logout = document.select("a[href^=logout.php?]");
    if (logout.isEmpty()) {
      return null;
    }

    String href = logout.attr("href");
    int start = href.indexOf(key);
    if (start < 0) {
      // Without this check indexOf's -1 would make substring() cut at an arbitrary offset
      // (or throw for short links).
      return null;
    }

    String sessionid = href.substring(start + key.length());
    int end = sessionid.indexOf('&');
    return end >= 0 ? sessionid.substring(0, end) : sessionid;
  }
Esempio n. 11
0
  /**
   * Loads the page at {@code url} and builds one {@code CaoImg} per {@code table} element whose
   * {@code class} attribute is exactly {@code wikitable}.
   *
   * <p>For each table the name is the {@code title} attribute of the first element that has
   * one, the image is the {@code src} of its {@code img}, and the intro / other name come from
   * the elements whose {@code style} contains {@code font-size}: a single match is the intro,
   * otherwise the first is the other name and the second the intro.
   *
   * @param url page to download through {@code HttpUtils.getString}
   * @return the entries in document order
   * @throws Exception on download failure or when a table lacks the expected elements
   */
  public static List<CaoImg> getImgData(String url) throws Exception {
    Document page = Jsoup.parse(HttpUtils.getString(url));
    List<CaoImg> result = new ArrayList<CaoImg>();

    for (Element candidate : page.getAllElements()) {
      // <table class="wikitable"
      // style="width: 22em; position: absolute; top: 0px; left: 0px;">
      boolean isWikiTable =
          candidate.nodeName().equals("table") && "wikitable".equals(candidate.attr("class"));
      if (!isWikiTable) {
        continue;
      }

      String name = candidate.getElementsByAttribute("title").get(0).attr("title");
      String imageSrc = candidate.getElementsByTag("img").attr("src");

      Elements sized = candidate.getElementsByAttributeValueContaining("style", "font-size");
      String otherName = null;
      String intro;
      if (sized.size() == 1) {
        intro = sized.get(0).text();
      } else {
        otherName = sized.get(0).text();
        intro = sized.get(1).text();
      }

      CaoImg entry = new CaoImg();
      entry.setName(name);
      entry.setImg(imageSrc);
      entry.setOtherName(otherName);
      entry.setIntro(intro);
      result.add(entry);
    }

    return result;
  }
Esempio n. 12
0
    /**
     * Downloads the page at {@code link} and fills {@code title}, {@code img_src}, {@code
     * description_text} (one paragraph per block, separated by blank lines) and {@code
     * DateAdded} from it. Failures are printed and leave the remaining fields untouched.
     *
     * @param params unused
     * @return always {@code null}
     */
    @Override
    protected Void doInBackground(Void... params) {
      try {
        Document page = Jsoup.connect(link).ignoreContentType(true).get();

        title = page.select("span.title2").text();
        img_src = page.select("img.news-record-thumbnail").attr("src");

        Elements paragraphs = page.select("div.news-block-justify").select("p");
        for (int i = 0; i < paragraphs.size(); i++) {
          description_text += paragraphs.get(i).text() + "\n\n";
        }

        DateAdded = page.select("span.title").text();
      } catch (Exception ex) {
        ex.printStackTrace();
      }
      return null;
    }
Esempio n. 13
0
  /**
   * Collects the search results referenced by the {@code data-products} attribute of the {@code
   * #productList} element.
   *
   * <p>The attribute is expected to contain a bracketed, comma-separated id list; for each id
   * the element with that id is selected from {@code elements} and passed to {@code
   * buildResultList}. A missing attribute, missing brackets or an empty list adds nothing.
   *
   * @param elements parsed search result markup, may be {@code null}
   * @return {@code productinfoList}
   */
  @Override
  public List<ParsedToken> collectSearchResult(Elements elements) {
    if (elements == null) {
      return this.productinfoList;
    }

    String products = elements.select("#productList").attr("data-products");

    // Locate the bracketed list explicitly; indexing into split() results throws when the
    // attribute is absent or the list is empty ("[]").
    int open = products.indexOf('[');
    int close = open < 0 ? -1 : products.indexOf(']', open + 1);
    if (close < 0) {
      return this.productinfoList;
    }

    String[] producIdList = products.substring(open + 1, close).split(",");
    for (String rawId : producIdList) {
      String id = rawId.trim();
      if (id.isEmpty()) {
        // "#" alone is not a valid selector.
        continue;
      }
      Elements productInfo = elements.select("#".concat(id));
      buildResultList(productInfo);
    }

    return this.productinfoList;
  }
Esempio n. 14
0
  /**
   * Parses a listing page into an {@code HNFeed}.
   *
   * <p>Rows of the inner table are consumed in groups of three: the first row of a group holds
   * title and URL, the second holds points, author and the comments link, the third is ignored.
   * Parsing stops at the first group whose title link is missing. The link matching "More", if
   * present, becomes the next-page URL.
   *
   * @param doc parsed page, may be {@code null}
   * @return the feed; an empty {@code HNFeed} when {@code doc} is {@code null}
   * @throws Exception declared for callers; propagated from the helper methods
   */
  @Override
  public HNFeed parseDocument(Document doc) throws Exception {
    if (doc == null) return new HNFeed();

    ArrayList<HNPost> posts = new ArrayList<HNPost>();

    // clumsy, but hopefully stable query - first element retrieved is the
    // top table, we have to skip that:
    Elements tableRows = doc.select("table tr table tr");
    if (!tableRows.isEmpty()) {
      // An unexpected page layout yields no rows at all; remove(0) would throw then.
      tableRows.remove(0);
    }

    Elements nextPageURLElements = tableRows.select("a:matches(More)");
    String nextPageURL = null;
    if (nextPageURLElements.size() > 0)
      nextPageURL = resolveRelativeHNURL(nextPageURLElements.attr("href"));

    String url = null;
    String title = null;
    String author = null;
    int commentsCount = 0;
    int points = 0;
    String urlDomain = null;
    String postID = null;

    boolean endParsing = false;
    for (int row = 0; row < tableRows.size(); row++) {
      int rowInPost = row % 3;
      Element rowElement = tableRows.get(row);

      switch (rowInPost) {
        case 0:
          Element e1 = rowElement.select("tr > td:eq(2) > a").first();
          if (e1 == null) {
            endParsing = true;
            break;
          }

          title = e1.text();
          url = resolveRelativeHNURL(e1.attr("href"));
          urlDomain = getDomainName(url);
          break;
        case 1:
          points =
              getIntValueFollowedBySuffix(rowElement.select("tr > td:eq(1) > span").text(), " p");
          author = rowElement.select("tr > td:eq(1) > a[href*=user]").text();
          Element e2 = rowElement.select("tr > td:eq(1) > a[href*=item]").first();
          if (e2 != null) {
            commentsCount = getIntValueFollowedBySuffix(e2.text(), " c");
            if (commentsCount == BaseHTMLParser.UNDEFINED && e2.text().contains("discuss"))
              commentsCount = 0;
            postID = getStringValuePrefixedByPrefix(e2.attr("href"), "id=");
          } else {
            commentsCount = BaseHTMLParser.UNDEFINED;
            // Do not let the id of the previous post leak into a post without an item link.
            postID = null;
          }

          posts.add(new HNPost(url, title, urlDomain, author, postID, commentsCount, points));
          break;
        default:
          break;
      }

      if (endParsing) break;
    }

    return new HNFeed(posts, nextPageURL);
  }
  /**
   * Fetches the e-can reservation page and its captcha image and answers with a JSON array.
   *
   * <p>The first array element carries the hidden form fields ({@code __EVENTTARGET}, {@code
   * __EVENTARGUMENT}, {@code __VIEWSTATE}, {@code __VIEWSTATEGENERATOR}), the captcha URL, the
   * captcha image as a base64 data URI and, when the server set one, the first cookie as a
   * name/value entry. The second element maps running indexes to the {@code option} elements of
   * {@code #ddlGetdate}.
   *
   * @param request incoming request (not read)
   * @param response receives the JSON payload
   * @throws ServletException declared by the servlet contract
   * @throws IOException when talking to the remote server or writing the response fails
   */
  @SuppressWarnings("unchecked")
  protected void doGet(HttpServletRequest request, HttpServletResponse response)
      throws ServletException, IOException {
    // 0.init
    String captchaURL = null;
    String captchaImage = null;
    BasicCookieStore cookieStore = new BasicCookieStore();

    // The client and both responses hold pooled connections; close them on every path.
    try (CloseableHttpClient httpClient =
        HttpClients.custom().setDefaultCookieStore(cookieStore).build()) {

      // 1.Send Get request header to server.
      // Get the response Html page.
      System.out.println("==========Send Request to e-can server==========");
      HttpGet httpGet = new HttpGet("https://www.e-can.com.tw/reservationUNMember_online.aspx");
      httpGet.addHeader(
          "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
      httpGet.addHeader("Accept-Encoding", "gzip, deflate");
      httpGet.addHeader("Accept-Language", "zh-TW,zh;q=0.8,en-US;q=0.5,en;q=0.3");
      httpGet.addHeader("Connection", "Keep-Alive");
      httpGet.addHeader("Host", "www.e-can.com.tw");
      httpGet.addHeader("User-Agent", "Mozilla");

      String html;
      try (CloseableHttpResponse resp = httpClient.execute(httpGet)) {
        System.out.println();

        // show server response status > GET 200 OK
        System.out.println(resp.getStatusLine());
        for (Header h : resp.getAllHeaders()) {
          System.out.println(h);
        }
        System.out.println("**********End of Headers********** \n\n");

        html = EntityUtils.toString(resp.getEntity());
      }

      // 2.Use Jsoup to parse html page.
      // Select cssString to get captchaURL and captchaKey source.
      Document htmlDoc = Jsoup.parse(html);
      Elements elementddlGetdate = htmlDoc.select("#ddlGetdate > option");
      // info for post later
      String eventTarget = htmlDoc.select("#__EVENTTARGET").val();
      String eventArgument = htmlDoc.select("#__EVENTARGUMENT").val();
      String viewState = htmlDoc.select("#__VIEWSTATE").val();
      String viewStateGenerator = htmlDoc.select("#__VIEWSTATEGENERATOR").val();
      int count = 0;
      JSONObject joOption = new JSONObject();
      for (Element e : elementddlGetdate) {
        joOption.put(count++, e.toString());
      }
      System.out.println("joOption = " + joOption);

      Elements elementCaptcha = htmlDoc.select("#captcha");
      System.out.println(elementCaptcha.attr("src"));
      captchaURL = "https://www.e-can.com.tw/" + elementCaptcha.attr("src");

      // show URL
      System.out.println("captchaURL=" + captchaURL);

      // 3.Send GET request to get the captchaImage source.
      // Encode source to base64 String.
      System.out.println("==========Send request to e-can for captcha image==========");
      httpGet = new HttpGet(captchaURL);
      httpGet.addHeader("Referer", "https://www.e-can.com.tw/reservationUNMember_online.aspx");
      httpGet.addHeader("Accept", "image/png,image/*;q=0.8,*/*;q=0.5");
      httpGet.addHeader("Accept-Encoding", "gzip, deflate");
      httpGet.addHeader("Accept-Language", "zh-TW,zh;q=0.8,en-US;q=0.5,en;q=0.3");
      httpGet.addHeader("Connection", "Keep-Alive");
      httpGet.addHeader("Host", "www.e-can.com.tw");
      httpGet.addHeader("User-Agent", "Mozilla");

      List<Cookie> cookies;
      byte[] bytes;
      try (CloseableHttpResponse resp = httpClient.execute(httpGet)) {
        System.out.println();

        cookies = cookieStore.getCookies();
        if (cookies.isEmpty()) {
          System.out.println("None");
        } else {
          for (int i = 0; i < cookies.size(); i++) {
            System.out.println("- " + cookies.get(i).toString());
          }
          // Only touch the first cookie when there is one; get(0) throws on an empty list.
          System.out.println("cookieName= " + cookies.get(0).getName());
          System.out.println("cookieValue= " + cookies.get(0).getValue());
        }
        System.out.println();

        System.out.println(resp.getStatusLine());
        for (Header h : resp.getAllHeaders()) {
          System.out.println(h);
        }
        System.out.println("**********End of Headers********** \n\n");

        HttpEntity entity = resp.getEntity();
        InputStream instream = entity.getContent();
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        // Copy in chunks instead of one byte per read() call.
        byte[] buffer = new byte[4096];
        int reads;
        while ((reads = instream.read(buffer)) != -1) {
          baos.write(buffer, 0, reads);
        }
        bytes = baos.toByteArray();
        EntityUtils.consume(entity);
      }

      captchaImage = "data:image/png;base64," + new BASE64Encoder().encode(bytes);

      // 4.Use Json format to wrap url, key and source string.
      // Then send response to ajax request from index.jsp
      JSONObject jo = new JSONObject();
      jo.put("__EVENTTARGET", eventTarget);
      jo.put("__EVENTARGUMENT", eventArgument);
      jo.put("__VIEWSTATE", viewState);
      jo.put("__VIEWSTATEGENERATOR", viewStateGenerator);
      jo.put("captchaURL", captchaURL);
      jo.put("captchaImage", captchaImage);
      if (!cookies.isEmpty()) {
        jo.put(cookies.get(0).getName(), cookies.get(0).getValue());
      }
      String bothJson = "[" + jo + "," + joOption + "]";
      response.setContentType("application/json");
      response.setCharacterEncoding("utf-8");
      PrintWriter out = response.getWriter();
      out.print(bothJson);
      out.flush();
    }
  }
  /**
   * Parses a comment listing page into {@code SNComments}.
   *
   * <p>After dropping the first row, every second row of the inner table is treated as a
   * comment: its text, creation time, author id, link / parent / discussion URLs, article title
   * and vote URL are read when present, and fields that are absent stay {@code null}. Parsing
   * stops at the first comment row without a text element. The link matching "More", if
   * present, is stored as the next-page URL.
   *
   * @param doc parsed page, may be {@code null}
   * @return the parsed comments; empty when {@code doc} is {@code null} or has no rows
   * @throws Exception declared for callers; propagated from the helper methods
   */
  @Override
  public SNComments parseDocument(Document doc) throws Exception {
    SNComments comments = new SNComments();
    if (doc == null) {
      return comments;
    }
    Elements tableRows = doc.body().select("table tr table tr");
    if (tableRows == null || tableRows.isEmpty()) {
      return comments;
    }
    tableRows.remove(0);

    // Link to the next page
    Elements moreURLElements = tableRows.select("a:matches(More)");
    String moreURL = null;
    if (moreURLElements.size() > 0) {
      moreURL = resolveRelativeSNURL(moreURLElements.attr("href"));
    }
    comments.setMoreURL(moreURL);

    // Only every second row carries a comment.
    for (int row = 0; row < tableRows.size(); row += 2) {
      Element rowElement = tableRows.get(row);
      Element textElement = rowElement.select("tr > td:eq(1) > span").first();
      if (textElement == null) {
        break;
      }

      // Declared per comment so that values of the previous comment cannot leak into one
      // that lacks the corresponding element.
      String linkURL = null;
      String parentURL = null;
      String discussURL = null;
      String created = null;
      String artistTitle = null; // article title
      String voteURL = null;
      String text = textElement.text();
      SNUser user = new SNUser();

      Element spanElement = rowElement.select("tr > td:eq(1) > div > span").first();
      if (spanElement != null) {
        created = getCreateAt(spanElement.text());
        Elements aElements = spanElement.select("span > a");
        if (aElements != null && aElements.size() >= 4) {
          user.setId(aElements.first().text());
          linkURL = resolveRelativeSNURL(aElements.get(1).attr("href"));
          parentURL = resolveRelativeSNURL(aElements.get(2).attr("href"));
          Element artistAElement = aElements.last();
          discussURL = resolveRelativeSNURL(artistAElement.attr("href"));
          artistTitle = artistAElement.text();
          // TODO edit delete (six links are present in that case)
        }
      }

      Element voteAElement = rowElement.select("tr > td:eq(0) a").first();
      if (voteAElement != null) {
        // Comments written by the logged-in user have no vote URL
        voteURL = resolveRelativeSNURL(voteAElement.attr("href"));
      }
      comments.addComment(
          new SNComment(
              linkURL, parentURL, discussURL, text, created, user, artistTitle, voteURL, null));
    }
    return comments;
  }
  /**
   * Downloads one painting.
   *
   * <p>The page at {@code paintingLink} is fetched, the {@code href} of its {@code download}
   * element is taken as the image URL, and title, artist, date and culture are parsed from the
   * {@code tombstone-container} text. The image is written to {@code folderPath} as {@code
   * <artist>-<title>.jpg}, both parts reduced to ASCII letters and digits.
   *
   * <p>Nothing is downloaded when the page cannot be fetched, when the image URL contains
   * whitespace, or when it is empty.
   *
   * @param paintingLink URL of the painting's page
   * @throws IOException when downloading or writing the image fails
   */
  public void paintingScraper(String paintingLink) throws IOException {
    Document doc;
    try {
      doc = Jsoup.connect(paintingLink).get();
    } catch (IOException e) {
      // Without a page there is nothing to parse or download.
      e.printStackTrace();
      return;
    }
    Elements imgContainer = doc.getElementsByClass("download");
    String imgURL = imgContainer.attr("href");
    for (int i = 0; i < imgURL.length(); i++) {
      if (Character.isWhitespace(imgURL.charAt(i))) {
        return;
      }
    }

    /*
     * The following parses the single image for
     * Title
     * Artist
     * Date
     * Culture
     * To be added to the image file properties
     */
    Elements imageInfo = doc.getElementsByClass("tombstone-container");
    String allInfo = imageInfo.select(".tombstone").text();

    String artist = null;
    int endArtist = 0;
    String date = null;
    int startDate = 0;
    int endDate = 0;
    String culture = null;
    int startCulture = 0;
    int endCulture = 0;
    /*
     * Parses the info for the Title
     */
    String title = imageInfo.select("h2").text();
    title = title.replaceAll("[^A-Za-z0-9 ]", "").trim();

    /*
     * Parses the info for the Artist
     * NOTE(review): the fixed offset 8 presumably skips a leading "Artist: " label - confirm.
     */
    for (int i = 9; i < allInfo.length(); i++) {
      if (!allInfo.contains("Artist:")) {
        endArtist = 0;
        break;
      }
      if (allInfo.substring(8, i).contains(": ")) {
        endArtist = i - 2;
        artist = allInfo.substring(8, endArtist).trim();
        if (artist.contains("(")) {
          for (int h = 0; h < artist.length(); h++) {
            if (artist.substring(0, h).contains("(")) {
              endArtist = h - 1;
              artist = artist.substring(0, h - 1).trim();
              break;
            }
          }
        } else {
          break;
        }
      }
    }

    /*
     * parses the info for the Date
     */
    for (int j = endArtist; j < allInfo.length(); j++) {
      if (allInfo.substring(endArtist, j).contains("Date: ")) {
        startDate = j;
        for (int k = startDate; k < allInfo.length(); k++) {
          if (allInfo.substring(startDate, k).contains("Culture: ")) {
            endDate = k - 10;
            date = allInfo.substring(startDate, endDate);
            break;
          }
          if (allInfo.substring(startDate, k).contains("Medium: ")) {
            endDate = k - 9;
            date = allInfo.substring(startDate, endDate);
            break;
          }
          if (allInfo.substring(startDate, k).contains("Dimensions: ")) {
            endDate = k - 13;
            date = allInfo.substring(startDate, endDate);
            break;
          }
        }
        break;
      }
    }
    /*
     * Parses the info for the culture
     */
    for (int l = endDate; l < allInfo.length(); l++) {
      if (allInfo.substring(endDate, l).contains("Culture: ")) {
        startCulture = l;
        for (int m = startCulture; m < allInfo.length(); m++) {
          if (allInfo.substring(startCulture, m).contains("Medium")) {
            endCulture = m - 6;
            culture = allInfo.substring(startCulture, endCulture).trim();
            break;
          }
          if (allInfo.substring(startCulture, m).contains("Geography")) {
            endCulture = m - 10;
            culture = allInfo.substring(startCulture, endCulture).trim();
            break;
          }
        }
        break;
      }
    }

    // attr() never returns null, so an empty string is the only "no image" case.
    if (imgURL.isEmpty()) {
      System.out.println("No info!");
      System.out.println("-----------------------------------------------");
      return;
    }
    System.out.println("artist: " + artist);
    System.out.println("title: " + title);
    System.out.println("date: " + date);
    System.out.println("culture: " + culture);
    System.out.println("url: " + imgURL);

    if (artist == null) {
      artist = "none";
      System.out.println("THIS SHOULD PRINT");
    }

    String artistFile = artist.replaceAll("[^A-Za-z0-9]", "");
    System.out.println("artistfIle: " + artistFile);
    String titleFile = title.replaceAll("[^A-Za-z0-9]", "");
    System.out.println("titlefile: " + titleFile);

    /*
     * The following copies the file into my directory
     */

    String destinationFile = folderPath + artistFile + "-" + titleFile + ".jpg";
    System.out.println(destinationFile);
    System.out.println("-----------------------------------------------");

    URL url = new URL(imgURL);
    // try-with-resources closes both streams even when the copy fails half-way.
    try (InputStream in = url.openStream();
        OutputStream out = new FileOutputStream(destinationFile)) {
      byte[] b = new byte[2048];
      int length;
      while ((length = in.read(b)) != -1) {
        out.write(b, 0, length);
      }
    }
  }
  /**
   * Migrates a right-rail list component from parsed HTML onto its repository node.
   *
   * <p>Three things are transferred from {@code ele} to children of {@code rightListNode}:
   *
   * <ul>
   *   <li>the text of the first {@code h2} becomes the {@code title} property;
   *   <li>each {@code h3} text is written as {@code subtitle} on the next
   *       {@code element_subtitle*} child node;
   *   <li>each {@code ul} is converted into a JSON string per {@code li} link and stored as
   *       the {@code listitems} property of the next {@code element_list*} child node.
   * </ul>
   *
   * <p>Every mismatch between the HTML and the node structure is reported by appending a
   * {@code Constants} message to the shared {@code sb} buffer. Any exception is logged and
   * reported through {@code sb}; nothing is rethrown.
   *
   * @param ele HTML element holding the list component markup
   * @param rightListNode repository node that receives the migrated values
   * @param session not used by this method
   * @param locale locale passed through to {@code FrameworkUtils.getLocaleReference}
   * @param urlMap URL map passed through to {@code FrameworkUtils.getLocaleReference}
   */
  private void setListElements(
      Element ele, Node rightListNode, Session session, String locale, Map<String, String> urlMap) {
    try {
      Elements headings = ele.getElementsByTag("h2");
      Elements subHeadings = ele.getElementsByTag("h3");
      Elements lists = ele.getElementsByTag("ul");

      // ---- component title (h2) ----
      if (headings.isEmpty()) {
        sb.append(Constants.LIST_HEADING_COMPONENT_NOT_FOUND);
      } else {
        rightListNode.setProperty("title", headings.first().text());
        if (headings.size() > 1) {
          sb.append(Constants.MISMATCH_IN_RIGHT_LIST_COUNT);
        }
      }

      // ---- sub-titles (h3) ----
      NodeIterator subtitleNodes = null;
      if (rightListNode.hasNode("element_subtitle_0")) {
        subtitleNodes = rightListNode.getNodes("element_subtitle*");
      }

      if (subtitleNodes == null) {
        if (!subHeadings.isEmpty()) {
          log.debug("subtitle node doesnot exist but ele exist");
          sb.append(Constants.LIST_COMPONENT_NOT_FOUND);
        }
      } else if (subHeadings.isEmpty()) {
        sb.append(Constants.LIST_HEADING_COMPONENT_NOT_FOUND);
        log.debug("h3 text is not avalable");
      } else {
        int elementCount = subHeadings.size();
        int nodeCount = (int) subtitleNodes.getSize();
        // Only when fewer nodes than headings are reported is the iterator probed with
        // hasNext(); otherwise next() is called unconditionally for every heading.
        boolean fewerNodes = nodeCount < elementCount;

        for (Element subHeading : subHeadings) {
          if (fewerNodes && !subtitleNodes.hasNext()) {
            continue;
          }
          Node subtitleNode = (Node) subtitleNodes.next();
          subtitleNode.setProperty("subtitle", subHeading.text());
        }

        if (fewerNodes) {
          sb.append(
              Constants.MISMATCH_IN_LIST_ELEMENT
                  + nodeCount
                  + Constants.SPOTLIGHT_ELEMENT_COUNT
                  + elementCount);
        } else if (nodeCount > elementCount) {
          sb.append(
              Constants.LIST_ELEMENTS_COUNT_MISMATCH
                  + nodeCount
                  + Constants.SPOTLIGHT_ELEMENT_COUNT
                  + elementCount);
        }
      }

      // ---- link lists (ul / li) ----
      NodeIterator listNodes = null;
      if (rightListNode.hasNode("element_list_0")) {
        listNodes = rightListNode.getNodes("element_list*");
      }

      if (listNodes == null) {
        sb.append(Constants.NO_LIST_NODES_FOUND);
      } else {
        for (Element ul : lists) {
          if (!listNodes.hasNext()) {
            continue;
          }
          Node listNode = (Node) listNodes.next();
          List<String> items = new ArrayList<String>();

          for (Element li : ul.getElementsByTag("li")) {
            String icon = "";
            String size = "";
            boolean openNewWindow = false;

            // Derive icon type and file size from the li's own text.
            try {
              String ownText = li.ownText();
              if (StringUtils.isNotEmpty(ownText)) {
                log.debug("OWn text is:" + ownText);
                String lowered = ownText.toLowerCase();
                boolean mentionsVideo = lowered.contains("video");
                if (mentionsVideo || lowered.contains("pdf")) {
                  // "video" takes precedence over "pdf" when both words occur.
                  icon = mentionsVideo ? "video" : "pdf";

                  int digitAt = 0;
                  while (digitAt < ownText.length()
                      && !Character.isDigit(ownText.charAt(digitAt))) {
                    digitAt++;
                  }
                  // From the first digit up to (not including) the last character; when no
                  // digit exists substring() throws and the catch below reports it.
                  size = ownText.substring(digitAt, ownText.length() - 1).replace(")", "").trim();
                }
              }
            } catch (Exception e) {
              sb.append(Constants.Exception_BY_SPECIAL_CHARACTER);
              log.error("Exception : ", e);
            }

            // Report an unexpected "new window" icon.
            if (!li.select("span.newwin").isEmpty()) {
              log.debug("extra new win icon found");
              sb.append(Constants.EXTRA_ICON_FOUND_IN_LIST);
            }

            // Report an unexpected lock icon.
            Elements images = li.getElementsByTag("img");
            if (!images.isEmpty() && images.attr("alt").equals("lock_icon")) {
              log.debug("lock icon found in the list");
              sb.append(Constants.EXTRA_LOCK_IMG_FOUND_IN_LIST);
            }

            Elements anchors = li.getElementsByTag("a");
            if (anchors.isEmpty()) {
              continue;
            }
            Element anchor = anchors.first();
            String absoluteHref = anchor.absUrl("href");
            // Fall back to the raw attribute when no absolute URL could be produced.
            String href = StringUtil.isBlank(absoluteHref) ? anchor.attr("href") : absoluteHref;

            log.debug("Before anchorHref" + absoluteHref + "\n");
            String resolvedHref = FrameworkUtils.getLocaleReference(href, urlMap, locale, sb);
            log.debug("after anchorHref" + resolvedHref + "\n");

            JSONObject item = new JSONObject();
            item.put("linktext", anchor.text());
            item.put("linkurl", resolvedHref);
            item.put("icon", icon);
            item.put("size", size);
            item.put("description", "");
            item.put("openInNewWindow", openNewWindow);
            items.add(item.toString());
          }

          listNode.setProperty("listitems", items.toArray(new String[items.size()]));
        }

        // Nodes left over after all <ul> elements were consumed.
        if (listNodes.hasNext()) {
          sb.append(Constants.MISMATCH_IN_RIGHT_LIST_COUNT);
        }
      }
    } catch (Exception e) {
      sb.append(Constants.UNABLE_TO_MIGRATE_LIST_COMPONENT);
      log.error("Exception : ", e);
    }
  }
Esempio n. 19
0
  /**
   * Downloads an article page and extracts its title and body into the
   * {@code resultTitle} and {@code resultCont} fields.
   *
   * <p>If the page embeds an {@code iframe#verticalContent} with a non-empty {@code src},
   * that frame document is fetched and used instead. When the final URL contains
   * {@code issue_<n>/articles/<n>}, each image's {@code data-src} is rewritten to a CDN URL
   * and stored as its {@code src}. Title, content, optional author and optional extra
   * sections are then selected with the configured {@code *_rex} selectors.
   *
   * @param ok value returned when a title was matched
   * @param fail value returned when no title was matched or the download failed
   * @param url address of the article page
   * @return {@code ok} on success, otherwise {@code fail}
   */
  public int crawBBWC(int ok, int fail, String url) {

    try {
      Document doc = Jsoup.connect(url).userAgent(UA).timeout(3000).get();

      // Follow the content iframe only when it really carries a src. An empty src would
      // make Jsoup.connect throw an unchecked exception, and without a redirect the page
      // already downloaded above is reused instead of being fetched a second time.
      String frameSrc = doc.select("iframe#verticalContent").attr("src").trim();
      if (!frameSrc.isEmpty()) {
        url = frameSrc;
        doc = Jsoup.connect(url).userAgent(UA).timeout(3000).get();
      }
      resultTitle = resultCont = "";

      // Rewrite image links
      Pattern p = Pattern.compile("issue_\\d+/articles/\\d+");
      Matcher m = p.matcher(url);
      if (m.find()) {
        String pre = "http://s4.cdn.bb.bbwc.cn/" + m.group();
        for (Element img : doc.select("img")) {
          String raw = img.attr("data-src");
          if (raw.isEmpty()) {
            // No data-src on this image: keep its existing src rather than blanking it.
            continue;
          }
          img.attr("src", raw.replace("uploadfile", pre));
        }
      }

      // Begin extraction
      Elements eletitle = doc.select(this.title_rex);
      Elements elecont = doc.select(this.cont_rex);
      Elements eleauth = null;
      Elements eleextra = null;

      if (Constant.DEBUG) {
        FileUtils.writeFile(doc.html(), "clip");
      }

      // Author and extra selectors are optional; an empty selector means "not configured".
      if (!auth_rex.equals("")) {
        eleauth = doc.select(this.auth_rex);
      }
      if (!extra_rex.equals("")) {
        eleextra = doc.select(this.extra_rex);
      }

      if (eletitle.size() > 0) {
        resultTitle = eletitle.get(0).html();
        if (elecont.size() > 0) {
          elecont = addStyleForTable(elecont);
          resultCont = elecont.get(0).html();
        }
        if (eleauth != null && eleauth.size() > 0) {
          // Author line is placed in front of the content.
          resultCont = "<p>" + eleauth.get(0).html() + "</p>" + resultCont;
        }
        if (eleextra != null) {
          eleextra = addStyleForTable(eleextra);
          if (eleextra.size() > 0) {
            resultCont = resultCont + eleextra.get(0).html();
          }
        }

        return ok;
      } else {
        MLog.e("", "没有匹配到title");
        return fail;
      }

    } catch (IOException e) {
      // A failed download is reported to the caller through the fail code below.
      e.printStackTrace();
    }
    return fail;
  }
 /**
  * Extracts the {@code formhash} token from an HTML page.
  *
  * @param html markup to parse
  * @return the {@code value} attribute of the first {@code input[name=formhash]} element
  *     that has one, or an empty string when there is none
  */
 public static String formHash(String html) {
   return Jsoup.parse(html).select("input[name=formhash]").attr("value");
 }