/**
 * Fetches every stylesheet linked from the given HTML and concatenates the fetched text.
 *
 * <p>For each {@code link[type=text/css]} element, the {@code href} is prefixed with
 * {@code PropsValues.PORTAL_URL} unless it already contains that value. The target is then
 * downloaded with Jsoup and its text appended, followed by a newline.
 *
 * @param htmlSource HTML markup to scan for stylesheet links
 * @return the text of all fetched stylesheets, each followed by a newline
 * @throws Exception if any stylesheet cannot be fetched
 */
public static String getCSSSource(String htmlSource) throws Exception {
  StringBuilder css = new StringBuilder();

  for (Element link : Jsoup.parse(htmlSource).select("link[type=text/css]")) {
    String location = link.attr("href");
    boolean hasPortalPrefix = location.contains(PropsValues.PORTAL_URL);
    String target = hasPortalPrefix ? location : PropsValues.PORTAL_URL + location;

    css.append(Jsoup.connect(target).get().text()).append("\n");
  }

  return css.toString();
}
Exemple #2
0
 /**
  * Downloads {@code url} and reads the fee from the fourth cell of the second table row inside
  * {@code #block-system-main}.
  *
  * <p>The request is repeated, without any limit, for as long as it fails with an
  * {@link IOException}; each failure message is printed to standard output.
  *
  * @param url page to download
  * @return {@code getFee} applied to the cell text, or {@code url} itself when no such cell exists
  */
 public static String requestFee(String url) {
   final String feeCell =
       "#block-system-main > table > tbody > tr:nth-child(2) > td:nth-child(4)";
   while (true) {
     try {
       Document page = Jsoup.connect(url).timeout(5000).get();
       if (page.select(feeCell).size() > 0) {
         return getFee(page.select(feeCell).text());
       }
       return url;
     } catch (IOException e) {
       System.out.println("requestFee : " + e.getMessage());
     }
   }
 }
Exemple #3
0
  /**
   * Rebuilds {@code majorList} from the course links found on the given page.
   *
   * <p>Every attempt starts by clearing {@code majorList}. Any exception is printed and the whole
   * download is attempted again; there is no retry limit.
   *
   * @param originalUrl page listing the courses
   */
  public static void initMajorList(String originalUrl) {
    System.out.println("preparing majorList");

    boolean loaded = false;
    while (!loaded) {
      try {
        majorList.clear();
        Document page = Jsoup.connect(originalUrl).timeout(10000).get();
        for (Element anchor :
            page.select("#accordion__target-3 > div.course-listing__box > a")) {
          MajorForCollection major = new MajorForCollection();
          major.setLevel(LEVEL);

          String title = anchor.select("h3").get(0).text().trim();
          major.setTitle(title);

          // Keep only the part of the paragraph text before the first '-'.
          String type = anchor.select("p").get(0).text().replaceAll("-[\\s\\S]*", "").trim();
          major.setType(type);

          String href = anchor.select("a").get(0).attr("href");
          major.setUrl(href);

          majorList.add(major);
        }
        loaded = true;
      } catch (Exception e) {
        e.printStackTrace();
      }
    }

    System.out.println("majorList prepared");
    System.out.println("majorList size: " + majorList.size());
  }
Exemple #4
0
  /**
   * Downloads and parses the page at the URL built by {@code getYahooURL} for the given date and
   * earnings type.
   *
   * <p>The method sleeps for 2 seconds before the first request. The request is attempted up to
   * {@code maxAttempt + 1} times (6 normally, 3 when {@code isWeekend(date)} is true), sleeping 4
   * seconds after each failed attempt. The response that passed {@code isValidConnection} is parsed
   * directly instead of issuing a second request for the same page.
   *
   * @param date date passed to {@code getYahooURL} and {@code isWeekend}
   * @param earningType earnings type passed to {@code getYahooURL}
   * @return the parsed document
   * @throws Exception if no response was obtained (the last failure is attached as the cause), if
   *     {@code isValidConnection} rejects the response, or if the thread is interrupted while
   *     sleeping
   */
  public static Document getDocument(LocalDate date, Earning.EARNINGS_TYPE earningType)
      throws Exception {
    Thread.sleep(2000);
    String actualURLRequest = getYahooURL(earningType, date);

    int maxAttempt = isWeekend(date) ? 2 : 5;

    Response calResponse = null;
    Exception lastFailure = null;
    for (int attempt = 0; attempt <= maxAttempt && calResponse == null; attempt++) {
      try {
        calResponse = Jsoup.connect(actualURLRequest).execute();
      } catch (Exception e) {
        // Remember why the attempt failed so the final error is diagnosable.
        lastFailure = e;
        Thread.sleep(4000);
      }
    }

    if (calResponse == null) {
      throw new Exception("The connection response was null;" + actualURLRequest, lastFailure);
    }
    if (!isValidConnection(calResponse)) {
      throw new Exception(
          "Invalid HTTP Status Code:" + calResponse.statusCode() + ";" + actualURLRequest);
    }

    // Parse the body already downloaded rather than fetching the page a second time.
    return calResponse.parse();
  }
Exemple #5
0
 /**
  * Resolves a type string by querying the unistats widget described by the {@code #kw} element of
  * the given page.
  *
  * <p>The widget URL is assembled from the element's {@code data-institution},
  * {@code data-course}, {@code data-orientation}, {@code data-language} and {@code data-kismode}
  * attributes. The result is the first word of the widget's
  * {@code #kisWidget > div.widgetCourse > h1} text. Any exception while fetching is printed and
  * leaves the result empty.
  *
  * @param doc page expected to contain a {@code #kw} element
  * @return the first space-delimited word of the heading, the whole trimmed heading when it has no
  *     space after its first character, or {@code ""} when nothing could be resolved
  */
 public static String getType(Document doc) {
   Element widgetAnchor = doc.select("#kw").first();
   if (widgetAnchor == null) {
     return "";
   }

   String widgetUrl =
       "http://widget.unistats.ac.uk/Widget/"
           + widgetAnchor.attr("data-institution")
           + "/"
           + widgetAnchor.attr("data-course")
           + "/"
           + widgetAnchor.attr("data-orientation")
           + "/"
           + "null/"
           + widgetAnchor.attr("data-language")
           + "/"
           + widgetAnchor.attr("data-kismode");

   String type = "";
   try {
     Document widget = Jsoup.connect(widgetUrl).timeout(10000).get();
     Element heading = widget.select("#kisWidget > div.widgetCourse > h1").first();
     if (heading != null) {
       String headingText = heading.text().trim();
       int firstSpace = headingText.indexOf(" ");
       if (firstSpace > 0) {
         type = headingText.substring(0, firstSpace);
       } else {
         type = headingText;
       }
     }
   } catch (Exception ex) {
     ex.printStackTrace();
   }
   return type;
 }
 /**
  * Downloads the body of {@code url} using {@code USER_AGENT}, following redirects, with a timeout
  * of {@code GET_TIMEOUT}.
  *
  * @param url address to fetch
  * @return the response body when the status code equals {@code OK}; {@code ""} for any other
  *     status or when an {@link IOException} occurs (both cases are logged)
  */
 public static String extractContent(String url) {
   try {
     Connection request = Jsoup.connect(url);
     request.userAgent(USER_AGENT).followRedirects(true).timeout(GET_TIMEOUT);

     long startedAt = System.currentTimeMillis();
     Connection.Response reply = request.execute();
     long elapsed = System.currentTimeMillis() - startedAt;

     int status = reply.statusCode();
     if (status != OK) {
       Logger.error("%s returned %d", url, status);
       return "";
     }

     String content = reply.body();
     Logger.info(
         "%s retrieved, content length %d, time %s sec.",
         url, content.length(), FormatUtil.millis2Seconds(elapsed));
     return content;
   } catch (IOException e) {
     Logger.error(e, "%s cannot be read.", url);
     return "";
   }
 }
 /**
  * Downloads the login captcha image, writes it to {@code EzraPoundUtil.CAPTCHA_DIR} and stores
  * the cookies returned with it in {@code captchaCookies}.
  *
  * <p>{@code captchaCookies} is cleared on every call. When the request fails, the method calls
  * itself with {@code times + 1} until {@code times} exceeds {@code maxRecursiveTimes}. A failure
  * while writing the image file is only printed; the method still returns {@code true}.
  *
  * @param times number of the current attempt
  * @return {@code false} once {@code times} exceeds {@code maxRecursiveTimes}, {@code true}
  *     otherwise
  */
 public boolean getCaptchaImgAndCookies(int times) {
   captchaCookies.clear();
   if (times > maxRecursiveTimes) return false;
   Connection con =
       JsoupUtil.getResourceCon(
           "https://www.zhihu.com/captcha.gif?r=" + System.currentTimeMillis() + "&type=login");
   Response rs = null;
   try {
     rs = con.execute();
   } catch (IOException e) {
     e.printStackTrace();
     log.info("获取验证码第" + times + "次失败");
     return getCaptchaImgAndCookies(++times);
   }
   File file = new File(EzraPoundUtil.CAPTCHA_DIR);
   // try-with-resources closes the stream even when the write fails.
   try (FileOutputStream out = new FileOutputStream(file)) {
     out.write(rs.bodyAsBytes());
   } catch (IOException e) {
     e.printStackTrace();
   }
   captchaCookies.putAll(rs.cookies());
   log.info("验证码已保存" + ",路径为:" + file.getAbsolutePath());
   log.info("验证码对应cookie为:" + captchaCookies);
   return true;
 }
  /**
   * Downloads {@code url}, stores the parsed page in {@code htmlDocument} and adds the absolute
   * URL of every {@code a[href]} element to {@code links}.
   *
   * @param url address of the page to fetch
   * @return {@code true} when the page was fetched and its content type contains
   *     {@code text/html}; {@code false} when an {@link IOException} occurs or the content type is
   *     missing or different
   */
  public boolean crawl(String url) {
    try {
      Connection connection = Jsoup.connect(url).userAgent(USER_AGENT);
      Document htmlDocument = connection.get();
      this.htmlDocument = htmlDocument;
      if (connection.response().statusCode() == 200) { // 200 is the HTTP OK status code
        System.out.println("\n**Visiting** Received web page at " + url);
      }

      // The content type is null when the server sent no Content-Type header.
      String contentType = connection.response().contentType();
      if (contentType == null || !contentType.contains("text/html")) {
        System.out.println("**Failure** Retrieved something other that HTML");
        return false;
      }
      Elements linksOnPage = htmlDocument.select("a[href]");
      System.out.println("Found (" + linksOnPage.size() + "),links");
      for (Element link : linksOnPage) {
        this.links.add(link.absUrl("href"));
      }

      return true;

    } catch (IOException ioe) {
      // The HTTP request failed; report it to the caller through the return value.
      return false;
    }
  }
  /**
   * Downloads the historical CSV of a symbol from xueqiu.com and stores it through
   * {@code Hbase.addData}.
   *
   * <p>Nothing happens when {@code Hbase.getData(symbol)} is already non-empty. The first CSV line
   * is skipped. Of every other line, the fields after the first are kept with double quotes
   * removed, and rows whose third kept field parses to zero are dropped. The remaining rows are
   * stored in reverse order. On an HTTP or parsing failure the symbol is added to {@code errors}
   * when {@code handleError} is set, otherwise it is passed to {@code WriteError}.
   *
   * @param symbol symbol used in the download URL and as the storage key
   */
  public static void parseCNSymbols(String symbol) {
    boolean alreadyStored = !Hbase.getData(symbol).equals("");
    if (alreadyStored) {
      return;
    }

    String url = "http://xueqiu.com/S/" + symbol + "/historical.csv";
    Response rs = null;
    try {
      Connection con = getConnection(url, "historyHttp");
      con.header("Referer", " http://xueqiu.com/S/" + symbol);
      rs = con.execute();
    } catch (IOException e1) {
      if (!handleError) {
        WriteError(symbol);
        System.out.println(symbol + " http error");
      } else {
        System.out.println(symbol + " http error");
        errors.add(symbol);
      }
      return;
    }

    try {
      BufferedReader reader = new BufferedReader(new StringReader(rs.body()));
      reader.readLine(); // The first line is the header row; discard it.

      List<JSONArray> rows = new ArrayList<JSONArray>();
      for (String line = reader.readLine(); line != null; line = reader.readLine()) {
        String[] fields = line.split(","); // CSV: fields are comma-separated.
        JSONArray dailyData = new JSONArray();
        // The first field is skipped; quotes are stripped from the rest.
        for (int i = 1; i < fields.length; i++) {
          dailyData.put(fields[i].replace("\"", ""));
        }
        if (Double.valueOf(dailyData.getString(2)) != 0) {
          rows.add(dailyData);
        }
      }

      // Store the rows in the opposite order to the one they were read in.
      JSONArray historicalData = new JSONArray();
      int index = rows.size() - 1;
      while (index >= 0) {
        historicalData.put(rows.get(index));
        index--;
      }
      Hbase.addData(symbol, type, historicalData.toString());
    } catch (Exception e) {
      if (!handleError) {
        WriteError(symbol);
        System.out.println(symbol + " parsing error");
      } else {
        System.out.println(symbol + " parsing error");
        errors.add(symbol);
      }
    }
  }
Exemple #10
0
  /**
   * Fetches {@code url} and converts every {@code tr} of the page's first {@code tbody} with the
   * given processor.
   *
   * @param url page containing the table
   * @param processor converter applied to each row element
   * @param <T> type produced for each row
   * @return the converted rows, in document order
   * @throws Exception if the page cannot be fetched, has no {@code tbody}, or the processor fails
   */
  private <T> List<T> processTable(String url, LineProcessor<T> processor) throws Exception {
    Document page = DomUtils.get(url).get();
    Element firstBody = page.getElementsByTag("tbody").get(0);

    List<T> results = new ArrayList<>();
    for (Element row : firstBody.getElementsByTag("tr")) {
      results.add(processor.processLine(row));
    }
    return results;
  }
Exemple #11
0
 /**
  * Fetches a fixed article page, prints its body, then prints the {@code src} of every element
  * whose {@code src} attribute ends with {@code .gif}.
  */
 @Test
 public void test03() throws Exception {
   String html = Jsoup.connect("http://www.jb51.net/article/16829.htm").execute().body();
   System.out.println(html);

   for (Element gif : Jsoup.parse(html).select("[src$=.gif]")) {
     System.out.println(gif.attr("src"));
   }
 }
  /**
   * Likes or un-likes a status on m.weibo.cn.
   *
   * <p>The cookie header string is split on {@code ;} into name/value pairs. Each pair is split at
   * its first {@code =} only, so values that themselves contain {@code =} are kept whole, and
   * surrounding whitespace is trimmed. Pairs without a name or without {@code =} are skipped.
   *
   * @param statusId id of the status, sent as the {@code id} form field
   * @param like {@code true} to add a like, {@code false} to remove it
   * @param cookie cookie header string, e.g. {@code "a=1; b=2"}
   * @return the parsed result when the server reports {@code ok == 1}
   * @throws TaskException with code {@code -100} when the response indicates the user is not
   *     logged in, with the server message for other failures, and with the timeout error when the
   *     request fails or returns an empty body
   */
  public LikeResultBean doLike(String statusId, boolean like, String cookie) throws TaskException {
    try {
      String url =
          like ? "http://m.weibo.cn/attitudesDeal/add" : "http://m.weibo.cn/attitudesDeal/delete";

      Map<String, String> cookieMap = new HashMap<String, String>();

      for (String cookiePair : cookie.split(";")) {
        int separator = cookiePair.indexOf('=');
        if (separator < 0) {
          continue; // not a name=value pair
        }
        String key = cookiePair.substring(0, separator).trim();
        if (key.isEmpty()) {
          continue; // a cookie needs a name
        }
        cookieMap.put(key, cookiePair.substring(separator + 1).trim());
      }

      Connection connection = Jsoup.connect(url);
      connection
          .userAgent(
              "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:33.0) Gecko/20100101 Firefox/33.0")
          .referrer("http://m.weibo.cn/")
          .cookies(cookieMap)
          .data("id", statusId)
          .method(Connection.Method.POST);
      if (like) connection.data("attitude", "heart");

      String body = connection.execute().body();
      if (!TextUtils.isEmpty(body)) {
        Logger.d(TAG, body);

        // A redirect to the SSO page or any HTML page is treated as "not logged in".
        if (body.indexOf("http://passport.weibo.cn/sso/crossdomain") != -1)
          throw new TaskException("-100", "未登录");
        else if (body.indexOf("<html") != -1) throw new TaskException("-100", "未登录");

        LikeResultBean likeBean = JSON.parseObject(body, LikeResultBean.class);
        if (likeBean.getOk() == 1) {
          return likeBean;
        } else if (likeBean.getOk() == -100) {
          throw new TaskException("-100", "未登录");
        } else {
          throw new TaskException("", likeBean.getMsg());
        }
      }
    } catch (Exception e) {
      if (e instanceof TaskException) throw (TaskException) e;

      e.printStackTrace();
    }

    throw new TaskException(TaskException.TaskError.timeout.toString());
  }
Exemple #13
0
 /**
  * Reloads {@code loginCookies} from {@code EzraPoundUtil.LOGIN_COOKIES_DIR} and requests the
  * zhihu home page with them.
  *
  * @return the result of {@code checkLogin} on the returned page, or {@code false} when the
  *     request fails with an {@link IOException}
  */
 public boolean loginBySavedCookies() {
   loginCookies.clear();
   readCookies(EzraPoundUtil.LOGIN_COOKIES_DIR, loginCookies);

   Connection homeRequest = JsoupUtil.getGetCon("https://www.zhihu.com");
   Response home;
   try {
     home = homeRequest.cookies(loginCookies).execute();
   } catch (IOException e) {
     e.printStackTrace();
     log.info("携带cookie登录测试失败");
     return false;
   }

   return checkLogin(Jsoup.parse(home.body()));
 }
  /**
   * Executes an HTTP request with a 10 second timeout, any content type accepted and no body size
   * limit.
   *
   * @param url address to request
   * @param method HTTP method to use
   * @param cookies cookies to send, or {@code null} for none
   * @param data form/query parameters to send, or {@code null} for none
   * @return the response, or {@code null} when the request fails with an {@link IOException}
   */
  private static Response execute(
      String url, Method method, Map<String, String> cookies, Map<String, String> data) {
    Connection request =
        Jsoup.connect(url).method(method).timeout(10000).ignoreContentType(true).maxBodySize(0);

    if (cookies != null) {
      request.cookies(cookies);
    }

    if (data != null) {
      for (Entry<String, String> field : data.entrySet()) {
        request.data(field.getKey(), field.getValue());
      }
    }

    try {
      return request.execute();
    } catch (IOException e) {
      e.printStackTrace();
      return null;
    }
  }
  /**
   * Downloads the post page at {@code url} and fills {@code title}, {@code excerpt},
   * {@code content} and {@code date} from it.
   *
   * <p>The post is processed only when the {@code #main} element holds exactly one element of
   * class {@code content}; otherwise a message is printed and nothing is changed. Each
   * {@code lightbox} element is replaced in place by the nodes returned from
   * {@code extractImageNodes}. Children of the content element are then read up to the first one
   * of class {@code clear}.
   *
   * @param aInConnection connection reused for the request; its URL is overwritten
   * @param images collection passed to {@code collectImages}
   * @throws IOException if the page cannot be fetched
   */
  public void download(Connection aInConnection, Collection<Image> images) throws IOException {
    aInConnection.url(url);
    Document lDocument = aInConnection.get();
    Elements lContents = lDocument.getElementById("main").getElementsByClass("content");

    if (lContents.size() != 1) {
      System.out.println("More than one content in main section of post page " + toString());
      return;
    }

    Element lContent = lContents.first();
    collectImages(lContent, images);

    // Swap every lightbox wrapper for the image nodes extracted from it.
    for (Element lLightbox : lContent.getElementsByClass("lightbox")) {
      Collection<Node> lImageNodes = extractImageNodes(lLightbox);
      lLightbox.parent().insertChildren(lLightbox.siblingIndex(), lImageNodes);
      lLightbox.remove();
    }

    StringBuilder lBody = new StringBuilder();
    for (Element lChild : lContent.children()) {
      if (lChild.hasClass("clear")) {
        // no more post content
        break;
      }

      boolean lIsTitle = title == null && lChild.tagName().equals("h1");
      if (lIsTitle) {
        // the first h1 header is the title and is not part of the content
        title = lChild.html();
        continue;
      }

      if (excerpt == null && lChild.tagName().equals("p")) {
        // the first paragraph doubles as the excerpt
        excerpt = lChild.text();
      }
      lBody.append(lChild.toString());
    }
    content = lBody.toString();

    String lHunDate = lContent.getElementsByClass("date").first().html();
    date = new PostDate(lHunDate);
  }
Exemple #16
0
 /**
  * Requests the zhihu home page and stores the value of its {@code _xsrf} field in {@code xsrf}.
  *
  * <p>When the request fails, the method calls itself with {@code times + 1} until {@code times}
  * exceeds {@code maxRecursiveTimes}.
  *
  * @param times number of the current attempt
  * @return {@code false} once {@code times} exceeds {@code maxRecursiveTimes}, {@code true} after
  *     a successful request
  */
 public boolean getXsrf(int times) {
   if (times > maxRecursiveTimes) {
     return false;
   }

   Response home;
   try {
     home = JsoupUtil.getGetCon("http://www.zhihu.com").execute();
   } catch (IOException e) {
     e.printStackTrace();
     log.info("获取_xsrf第" + times + "次失败");
     return getXsrf(times + 1);
   }

   Document homePage = Jsoup.parse(home.body());
   xsrf = homePage.select(".view.view-signin [name=\"_xsrf\"]").attr("value");
   log.info("已获得xsrf:" + xsrf);
   return true;
 }
  /**
   * Returns the cached document, fetching it with a POST to {@code _url} on first use.
   *
   * @return the document, or {@code null} when it is not cached and the request fails with an
   *     {@link IOException}
   */
  protected Document getDocument() {
    if (_document == null) {
      try {
        _document = Jsoup.connect(_url).post();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }

    return _document;
  }
Exemple #18
0
  /**
   * Downloads {@code url}, stores the parsed page in {@code htmlPage} and adds the absolute URL of
   * every {@code a[href]} element to {@code links}. An {@link IOException} is only printed.
   *
   * @param url address of the page to fetch
   */
  public void crawl(String url) {
    try {
      Document fetched = Jsoup.connect(url).userAgent(USER_AGENT).get();
      this.htmlPage = fetched;

      System.out.println("WEB PAGE FOUND AT " + url);

      Elements anchors = fetched.select("a[href]");
      System.out.println("FOUND (" + anchors.size() + ") links");
      for (Element anchor : anchors) {
        String target = anchor.absUrl("href");
        this.links.add(target);
      }
    } catch (IOException e) {
      System.out.println("ERROR: " + e);
    }
  }
  /**
   * Fetches one page of the group search for the giveaway at {@code path} and converts each table
   * row into a {@link GiveawayGroup}.
   *
   * <p>Rows without a {@code .table__column__heading} element, or whose link is too short to
   * contain the id at characters 7 to 12, are skipped instead of aborting the task with an
   * unchecked exception.
   *
   * @param params unused
   * @return the groups found on the page, or {@code null} when the page cannot be fetched
   */
  @Override
  protected List<GiveawayGroup> doInBackground(Void... params) {
    Log.d(TAG, "Fetching giveaways for page " + page);

    try {
      // Fetch the Giveaway page

      Connection jsoup =
          Jsoup.connect("http://www.steamgifts.com/giveaway/" + path + "/groups/search")
              .userAgent(Constants.JSOUP_USER_AGENT)
              .timeout(Constants.JSOUP_TIMEOUT);
      jsoup.data("page", Integer.toString(page));

      if (SteamGiftsUserData.getCurrent(fragment.getContext()).isLoggedIn())
        jsoup.cookie(
            "PHPSESSID", SteamGiftsUserData.getCurrent(fragment.getContext()).getSessionId());
      Document document = jsoup.get();

      SteamGiftsUserData.extract(fragment.getContext(), document);

      // Parse all rows of groups
      Elements groups = document.select(".table__row-inner-wrap");
      Log.d(TAG, "Found inner " + groups.size() + " elements");

      List<GiveawayGroup> groupList = new ArrayList<>();
      for (Element element : groups) {
        Element link = element.select(".table__column__heading").first();
        if (link == null) {
          // Row has no heading link, so there is nothing to build a group from.
          continue;
        }

        String href = link.attr("href");
        if (href.length() < 12) {
          // Link is too short to hold the id slice taken below.
          continue;
        }

        // Basic information
        String title = link.text();
        String id = href.substring(7, 12);

        String avatar = null;
        Element avatarNode = element.select(".global__image-inner-wrap").first();
        if (avatarNode != null) avatar = Utils.extractAvatar(avatarNode.attr("style"));

        GiveawayGroup group = new GiveawayGroup(id, title, avatar);
        groupList.add(group);
      }

      return groupList;
    } catch (IOException e) {
      Log.e(TAG, "Error fetching URL", e);
      return null;
    }
  }
Exemple #20
0
  /**
   * Logs in through the remote login form and copies the resulting session cookies onto the
   * servlet response.
   *
   * <p>The login page is fetched first. Every named element of its first form is collected as
   * form data, with the {@code username} and {@code password} fields overwritten by the given
   * values. That data is posted back with the cookies from the first response. The cookies
   * returned by the login are then used to call {@code /user/getUser}, and the cookies of that
   * response are added to {@code res}.
   *
   * @param req current request; its context path is used for the user URL and the cookie path
   * @param res response that receives the cookies
   * @param username value for the {@code username} field
   * @param password value for the {@code password} field
   * @return the body of the {@code /user/getUser} response converted by
   *     {@code JsonUtil.jsonToObject}
   * @throws Exception if any request fails or the login page has no form
   */
  public static Map<String, String> login(
      HttpServletRequest req, HttpServletResponse res, String username, String password)
      throws Exception {
    String url = ReadProperties.getByName("login.ip") + "/login";

    // Open the connection and present a browser-like user agent.
    Connection con = Jsoup.connect(url).timeout(120000);
    con.header(
        "User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20100101 Firefox/29.0");
    Response rs = con.execute();
    Map<String, String> cookies = rs.cookies();

    // Parse the login page and take its first form.
    Document doc = Jsoup.parse(rs.body());
    Element form = doc.select("form").get(0);

    Map<String, String> datas = new HashMap<String, String>();
    for (Element field : form.getAllElements()) {
      String name = field.attr("name");
      if (name.equals("username")) {
        field.attr("value", username);
      }
      if (name.equals("password")) {
        field.attr("value", password);
      }
      // Elements without a name are not form fields.
      if (name.length() > 0) {
        datas.put(name, field.attr("value"));
      }
    }

    // Post the collected form data together with the cookies from the first response.
    Response login = con.data(datas).cookies(cookies).method(Method.POST).execute();

    url = ReadProperties.getByName("common.ip") + req.getContextPath() + "/user/getUser";
    con = Jsoup.connect(url).cookies(login.cookies()).ignoreContentType(true).method(Method.GET);
    rs = con.execute();

    String cookiePath = req.getContextPath() + "/";
    for (Entry<String, String> entry : rs.cookies().entrySet()) {
      Cookie cookie = new Cookie(entry.getKey(), entry.getValue());
      cookie.setPath(cookiePath);
      res.addCookie(cookie);
    }
    return JsonUtil.jsonToObject(rs.body(), Map.class);
  }
Exemple #21
0
  /**
   * Logs in interactively: downloads a captcha, reads e-mail, password and captcha text from
   * standard input, posts them to the zhihu login endpoint and verifies the result.
   *
   * <p>On success the cookies of the login response are stored in {@code loginCookies} and saved
   * to {@code EzraPoundUtil.LOGIN_COOKIES_DIR}.
   *
   * @return {@code true} when {@code checkLogin} accepts the home page fetched with the new
   *     cookies; {@code false} when either request fails or the check does not pass
   */
  public boolean loginByEmailAndPwd() {
    loginCookies.clear();
    // Deliberately not closed: closing the Scanner would close System.in.
    Scanner sc = new Scanner(System.in);
    getCaptchaImgAndCookies(0);
    log.info("请输入账号:");
    email = sc.nextLine();
    log.info("请输入密码");
    password = sc.nextLine();
    log.info("查看验证码并输入");
    captcha = sc.nextLine();
    Connection con = JsoupUtil.getPostCon("https://www.zhihu.com/login/email");
    Response rs = null;
    try {
      rs =
          con.data("_xsrf", xsrf)
              .data("email", email)
              .data("password", password)
              .data("remember_me", remeberMe)
              .data("captcha", captcha)
              .cookies(captchaCookies)
              .ignoreContentType(true)
              .execute();
    } catch (IOException e) {
      e.printStackTrace();
      log.info("通过账号密码登录发生异常");
      return false;
    }

    JSONObject jsonObject = new JSONObject(rs.body());
    // The value is not used, but the lookup is kept so a response lacking "r" still fails here.
    String result = jsonObject.get("r").toString();
    log.info(EzraPoundUtil.unicode2Character(jsonObject.get("msg").toString()));

    Response rs2;
    try {
      rs2 = JsoupUtil.getGetCon("https://www.zhihu.com").cookies(rs.cookies()).execute();
    } catch (IOException e) {
      e.printStackTrace();
      // Without the verification page the login cannot be confirmed.
      return false;
    }
    if (checkLogin(Jsoup.parse(rs2.body()))) {
      loginCookies.putAll(rs.cookies());
      saveCookies(EzraPoundUtil.LOGIN_COOKIES_DIR, loginCookies);
      return true;
    }
    return false;
  }
Exemple #22
0
  /**
   * Fetches {@code VOSA_URL} with a 30 second timeout and prints the {@code href} and text of
   * every {@code a.top_link} element.
   *
   * @throws IOException if the page cannot be fetched
   */
  public static void getVOSA() throws IOException {
    Document page = Jsoup.connect(VOSA_URL).timeout(30000).get();

    for (Element anchor : page.select("a.top_link")) {
      System.out.println("\nlink : " + anchor.attr("href"));
      System.out.println("text : " + anchor.text());
    }
  }
  /**
   * Executes the given connection and merges the cookies of the response into {@code cookies}.
   *
   * <p>The target URL is always printed. When {@code props.getMode()} is {@code Mode.TEST}, no
   * request is sent.
   *
   * @param connection Jsoup connection object
   * @param method HTTP method to apply, or {@code null} to keep the connection's current method
   * @return the response, or {@code null} in test mode or when the request fails with an
   *     {@link IOException}
   */
  public Connection.Response execute(Connection connection, Connection.Method method) {
    if (method != null) {
      connection.method(method);
    }

    try {
      System.out.println("Calling " + connection.request().url());
      boolean testMode = props.getMode() == Mode.TEST;
      if (testMode) {
        return null;
      }
      Connection.Response result = connection.execute();
      this.cookies.putAll(result.cookies());
      return result;
    } catch (IOException e) {
      e.printStackTrace();
      return null;
    }
  }
Exemple #24
0
 /**
  * Runs a Google search for {@code term} restricted to {@code filetype}, starting at result
  * offset {@code start}, and returns the URLs shown in the {@code cite} elements of the result
  * page.
  *
  * <p>Cited text that does not start with {@code http} is prefixed with {@code http://}.
  *
  * @return the cited URLs in page order
  * @throws Exception if the search page cannot be fetched or a cited text is not a valid URL
  */
 public URL[] getURLs() throws Exception {
   String query = term + " filetype:" + filetype;
   String searchAddress =
       "http://www.google.com/search?q=" + URLEncoder.encode(query, "UTF-8") + "&start=" + start;

   Connection con = HttpConnection.connect(new URL(searchAddress));
   con.timeout(60000);
   con.userAgent("");

   List<URL> urls = new ArrayList<>();
   for (Element cite : con.get().select("cite")) {
     String text = cite.text();
     String address = text.startsWith("http") ? text : "http://" + text;
     urls.add(new URL(address));
   }
   return urls.toArray(new URL[urls.size()]);
 }
    public Document
	docGet_URL(String url)
    {
	Document doc;
	Connection conn;
	int timeout = CONNECT_TIMEOUT_INI;
	conn = null;
	try {
	    conn = Jsoup.connect(url);
	} catch (Exception _) {
	    ; // conn is null at this point
	} // end of [try]
	if (conn == null) return null;
	while (true) {
	    conn = conn.timeout(timeout);
	    try {
		return conn.get();
	    } catch (IOException _) {
		timeout = 2 * timeout;
		if (timeout > CONNECT_TIMEOUT_FIN) { _geterr(url); break; }
	    } // end of [try]
	}
	return null; // HX: max timeout reached at this point
    }
  /**
   * Parses the given feed and extracts out and parsers all linked items within the feed, using the
   * underlying ROME feed parsing library.
   *
   * @param rss A {@link Content} object representing the feed that is being parsed by this {@link
   *     Parser}.
   * @return A {@link ParseResult} containing all {@link Parse}d feeds that were present in the feed
   *     file that this {@link Parser} dealt with.
   */
  public String getParse(String url, boolean json, HashMap<String, Object> params)
      throws Exception {

    url = URLDecoder.decode(url, "UTF-8");
    URL feedURL = new URL(url);
    // Logger.getLogger(this.getClass().getName()).log(Level.INFO, "Encoding del Feed: {0}", new
    // Object[]{feedURL.openConnection().getContentEncoding()});
    Feed feed = FeedParser.parse(feedURL);

    // List<PostType> newsList = new ArrayList<PostType>();

    PostType newEntry;
    // PostType newEntryComments;
    Post newEntrySolr;
    // SyndFeed feed = null;

    Gson gson = new Gson();

    List<LinkType> links;

    Document doc;

    FeedSelectors feedSelectors;

    String extendedString = (String) params.get("zone");
    Place place = null;
    if (params.containsKey("place")) {
      place = placeDao.retrieveByExtendedString(extendedString);
    }
    org.zonales.tagsAndZones.objects.Zone zone = zoneDao.retrieveByExtendedString(extendedString);

    if (!json) {
      for (int i = 0; i < feed.getItemCount(); i++) {
        FeedItem entry = feed.getItem(i);
        Logger.getLogger(this.getClass().getName())
            .log(
                Level.INFO, "Intentando conectar a {0}", new Object[] {entry.getLink().toString()});

        Connection conn = Jsoup.connect(entry.getLink().toString());
        conn.timeout(60000);
        doc = conn.get();
        String responseURL = conn.response().url().getHost();
        //                doc = Jsoup.connect(entry.getLink().toString()).timeout(60000).get();
        Logger.getLogger(this.getClass().getName())
            .log(Level.INFO, "Parseando la URL: {0}", new Object[] {entry.getLink().toString()});
        feedSelectors = dao.retrieve(url);
        if (findWords(
            entry.getTitle(),
            doc,
            (ArrayList) params.get("searchlist"),
            (ArrayList) params.get("blacklist"),
            feedSelectors)) {
          newEntry = new PostType();
          String source;
          if (feed.getHeader() == null || feed.getHeader().getLink() == null) {
            Logger.getLogger(this.getClass().getName()).log(Level.INFO, "NULL: Link");
            source = feedURL.getHost();
          } else {
            Logger.getLogger(this.getClass().getName())
                .log(Level.INFO, "NO NULL: {0}", feed.getHeader().getLink().toString());
            source = feed.getHeader().getLink().getHost();
            //                        if (source.indexOf("/") != -1) {
            //                            source = source.substring(0, source.indexOf("/") + 1);
            //                        }
          }
          newEntry.setSource(source);
          newEntry.setDocType("post");
          newEntry.setZone(
              new Zone(
                  String.valueOf(zone.getId()),
                  zone.getName(),
                  zone.getType().getName(),
                  zone.getExtendedString()));

          newEntry.setPostLatitude(Double.parseDouble((String) params.get("latitud")));
          newEntry.setPostLongitude(Double.parseDouble((String) params.get("longitud")));
          // newEntry.setId(entry.getUri());
          // newEntry.setId(entry.getUri() != null && entry.getUri().length() > 0 ?
          // entry.getUri().trim() : entry.getLink().trim()+entry.getTitle().trim());
          newEntry.setId(
              entry.getGUID() != null
                  ? entry.getGUID()
                  : (entry.getElementValue("http://www.w3.org/2005/Atom", "id") != null
                      ? entry.getElementValue("http://www.w3.org/2005/Atom", "id")
                      : (entry.getTitle())));
          newEntry.setFromUser(
              new User(
                  null,
                  source,
                  null,
                  null,
                  place != null
                      ? new org.zonales.entities.Place(
                          String.valueOf(place.getId()), place.getName(), place.getType().getName())
                      : null));
          newEntry.setTitle(entry.getTitle());
          newEntry.setText(entry.getDescriptionAsText());
          newEntry.setTags(new TagsType((ArrayList) params.get("tagslist")));

          if (newEntry.getLinks() == null) {
            newEntry.setLinks(new LinksType(new ArrayList<LinkType>()));
          }
          if ((links = getLinks(feedSelectors, doc, responseURL)) != null) {
            newEntry.getLinks().getLink().addAll(links);
          }
          newEntry.getLinks().getLink().add(new LinkType("source", entry.getLink().toString()));

          if (newEntry.getActions() == null) {
            newEntry.setActions(new ActionsType(new ArrayList<ActionType>()));
          }
          newEntry.setActions(
              new ActionsType(
                  getActions(
                      feedSelectors,
                      doc,
                      newEntry.getId(),
                      json,
                      (Boolean) params.get("comments"),
                      source)));

          if (entry.getPubDate() != null) {
            newEntry.setCreated(String.valueOf(entry.getPubDate().getTime()));
          }

          if (entry.getModDate() != null) {
            newEntry.setModified(String.valueOf(entry.getModDate().getTime()));
          }

          for (ActionType action : newEntry.getActions().getAction()) {
            if ("comments".equals(action.getType())) {
              newEntry.setRelevance(action.getCant());
            }
          }

          if (!json) {
            newEntry.setVerbatim(gson.toJson(newEntry));
          }

          newsList.add(newEntry);

          // addToMap(parseResult, feed, feedLink, entry, content, newEntry);
        }
      }

      PostsType news;

      news = new PostsType(newsList);
      completeLinks(news);
      Feed2XML(news, sw);
      return sw.toString(); // + comments.toString();
    } else {
      for (int i = 0; i < feed.getItemCount(); i++) {
        FeedItem entry = feed.getItem(i);
        Logger.getLogger(this.getClass().getName())
            .log(
                Level.INFO, "Intentando conectar a {0}", new Object[] {entry.getLink().toString()});

        Connection conn = Jsoup.connect(entry.getLink().toString());
        conn.timeout(60000);
        doc = conn.get();
        String responseURL = conn.response().url().getHost();
        //                Logger.getLogger(this.getClass().getName()).log(Level.INFO, "RESPONSE URL:
        // {0}", responseURL);
        //                doc = Jsoup.connect(entry.getLink().toString()).timeout(60000).get();
        Logger.getLogger(this.getClass().getName())
            .log(Level.INFO, "Parseando la URL: {0}", new Object[] {entry.getLink().toString()});
        feedSelectors = dao.retrieve(url);
        if (findWords(
            entry.getTitle(),
            doc,
            (ArrayList) params.get("searchlist"),
            (ArrayList) params.get("blacklist"),
            feedSelectors)) {
          newEntrySolr = new Post();
          String source;
          if (feed.getHeader() == null || feed.getHeader().getLink() == null) {
            Logger.getLogger(this.getClass().getName()).log(Level.INFO, "NULL: Link");
            source = feedURL.getHost();
          } else {
            Logger.getLogger(this.getClass().getName())
                .log(Level.INFO, "NO NULL: {0}", feed.getHeader().getLink().toString());
            source = feed.getHeader().getLink().getHost();
            //                        if (source.indexOf("/") != -1) {
            //                            source = source.substring(0, source.indexOf("/") + 1);
            //                        }
          }
          newEntrySolr.setSource(source);
          newEntrySolr.setDocType("post");
          newEntrySolr.setZone(
              new Zone(
                  String.valueOf(zone.getId()),
                  zone.getName(),
                  zone.getType().getName(),
                  zone.getExtendedString()));

          newEntrySolr.setPostLatitude(Double.parseDouble((String) params.get("latitud")));
          newEntrySolr.setPostLongitude(Double.parseDouble((String) params.get("longitud")));
          // newEntry.setId(entry.getUri());
          // newEntry.setId(entry.getUri() != null && entry.getUri().length() > 0 ?
          // entry.getUri().trim() : entry.getLink().trim()+entry.getTitle().trim());
          newEntrySolr.setId(
              entry.getGUID() != null
                  ? entry.getGUID()
                  : (entry.getElementValue("http://www.w3.org/2005/Atom", "id") != null
                      ? entry.getElementValue("http://www.w3.org/2005/Atom", "id")
                      : (entry.getTitle())));
          newEntrySolr.setFromUser(
              new User(
                  null,
                  source,
                  null,
                  null,
                  place != null
                      ? new org.zonales.entities.Place(
                          String.valueOf(place.getId()), place.getName(), place.getType().getName())
                      : null));
          newEntrySolr.setTitle(entry.getTitle());
          newEntrySolr.setText(entry.getDescriptionAsText());
          newEntrySolr.setTags(new ArrayList<String>((ArrayList) params.get("tagslist")));

          if (newEntrySolr.getLinks() == null) {
            newEntrySolr.setLinks(new ArrayList<LinkType>());
          }
          if ((links = getLinks(feedSelectors, doc, responseURL)) != null) {
            newEntrySolr.getLinks().addAll(links);
          }
          newEntrySolr.getLinks().add(new LinkType("source", entry.getLink().toString()));

          if (newEntrySolr.getActions() == null) {
            newEntrySolr.setActions(new ArrayList<ActionType>());
          }
          newEntrySolr
              .getActions()
              .addAll(
                  getActions(
                      feedSelectors,
                      doc,
                      newEntrySolr.getId(),
                      json,
                      (Boolean) params.get("comments"),
                      source));

          if (entry.getPubDate() != null) {
            newEntrySolr.setCreated((entry.getPubDate().getTime()));
          }
          if (entry.getModDate() != null) {
            newEntrySolr.setModified((entry.getModDate().getTime()));
          }

          for (ActionType action : newEntrySolr.getActions()) {
            if ("comments".equals(action.getType())) {
              newEntrySolr.setRelevance(action.getCant());
            }
          }

          if (!json) {
            newEntrySolr.setVerbatim(gson.toJson(newEntrySolr));
          }

          newsListSolr.add(newEntrySolr);

          // addToMap(parseResult, feed, feedLink, entry, content, newEntry);
        }
      }
      return "{post: " + gson.toJson(newsListSolr) + "}"; // + comments.toString();
    }
  }
 /**
  * Adds the {@code code} data parameter, set to {@code giveawayId}, to the given connection.
  *
  * @param connection the connection that receives the extra request data
  */
 @Override
 public void addExtraParameters(Connection connection) {
   connection.data("code", giveawayId);
 }
Exemple #28
0
  /**
   * Downloads the course page at {@code major.getUrl()} and copies the details found there onto
   * {@code major}: length, month of entry, handbook link (stored in the application-fee field),
   * academic requirements, school, tuition fee and course structure.
   *
   * <p>Every section is optional: when the page lacks a section, the corresponding field is left
   * untouched. The handbook page is fetched for the course structure only when a handbook link is
   * known and is not one of the two excluded handbook URLs. The method finishes by calling
   * {@code mark(major, true)}.
   *
   * @param major the major to populate; its URL is the page that is fetched
   * @throws Exception propagated from the page fetches and the parsing helpers
   */
  public static void getDetails(MajorForCollection major) throws Exception {
    Connection conn = Jsoup.connect(major.getUrl());
    Document doc = conn.timeout(10000).followRedirects(true).get();

    Element basicTable = doc.select("table.course-page__table-basic").first();
    if (basicTable != null) {
      for (Element tr : basicTable.select("tr")) {
        String rowText = tr.text();
        // NOTE(review): the helpers are handed the text of the whole table, not of the matching
        // row; kept as in the original - confirm this is what getLength/getMonthOfEntry expect.
        if (rowText.contains("Duration")) {
          major.setLength(getLength(basicTable.text()));
        } else if (rowText.contains("Start date")) {
          major.setMonthOfEntry(getMonthOfEntry(basicTable.text()));
        }
      }
    }

    Element handbookLink = doc.select("a.btn.btn-bordered").first();
    if (handbookLink != null) {
      major.setApplicationFee(handbookLink.attr("href"));
    }

    Element entryRequirements = doc.select("#entry-requirements-2").first();
    if (entryRequirements != null) {
      major.setAcademicRequirements(entryRequirements.text());
    }

    Element facultyLink = doc.select("div.course-page.row a").last();
    if (facultyLink != null) {
      // Store the raw link first; it stays in place when the faculty URL is not a known one.
      major.setSchool(facultyLink.attr("href"));
      String schoolName = schoolNameForFacultyUrl(major.getSchool());
      if (schoolName != null) {
        major.setSchool(schoolName);
      }
    }

    Element feesHeading = doc.select("#fees").first();
    if (feesHeading != null) {
      // The fee text lives in the element following the heading; it may be absent.
      Element feesBody = feesHeading.nextElementSibling();
      if (feesBody != null) {
        major.setTuitionFee(feesBody.text());
      }
    }

    // The handbook link is optional like every other section, so guard against it being unset
    // instead of failing with a NullPointerException.
    String handbookUrl = major.getApplicationFee();
    if (handbookUrl != null
        && !"http://www.monash.edu.au/pubs/handbooks/courses/A6015.html".equals(handbookUrl)
        && !"http://www.monash.edu.au/pubs/handbooks/courses/2276.html".equals(handbookUrl)) {
      Document handbook = WebUtils.getDocument(handbookUrl, WebUtils.METHOD_GET, 10 * 1000);
      Element requirements =
          handbook
              .select("h2.black.pub_heading:containsOwn(Requirements) + div.pub_body_text")
              .first();
      if (requirements != null) {
        major.setStructure(replaceSpecialCharacter(html2Str(requirements.outerHtml())).trim());
        if (major.getStructure().contains("Part A.")) {
          // Keep only the text from the first "Part A." onwards.
          major.setStructure(
              major.getStructure().substring(major.getStructure().indexOf("Part A.")));
        } else {
          // No "Part A." in the requirements: prefer the dedicated Structure section if present.
          Element structure =
              handbook
                  .select("h2.black.pub_heading:containsOwn(Structure) + div.pub_body_text")
                  .first();
          if (structure != null) {
            major.setStructure(replaceSpecialCharacter(html2Str(structure.text())).trim());
          }
        }
      }
    }

    mark(major, true);
  }

  /**
   * Maps a faculty-website link to the school name stored for it.
   *
   * @param facultyUrl the link taken from the course page
   * @return the school name, or {@code null} when the link is not one of the known faculty URLs
   */
  private static String schoolNameForFacultyUrl(String facultyUrl) {
    switch (facultyUrl) {
      case "http://www.study.monash/media/links/faculty-websites/design-and-architecture":
        return "Monash Art Design & Architecture";
      case "http://www.study.monash/media/links/faculty-websites/business-and-economics":
        return "Monash Business School";
      case "http://www.study.monash/media/links/faculty-websites/arts":
        return "Faculty of Arts, Monash University";
      case "http://www.study.monash/media/links/faculty-websites/science":
        return "Faculty of Science";
      case "http://www.study.monash/media/links/faculty-websites/medicine":
        return "Faculty of Medicine, Nursing and Health Sciences";
      case "http://www.study.monash/media/links/faculty-websites/education":
        return "Faculty of Education - Faculty of Education";
      case "http://www.study.monash/media/links/faculty-websites/engineering":
        return "Faculty of Engineering, Monash University";
      case "http://www.study.monash/media/links/faculty-websites/information-technology":
        return "Faculty of Information Technology - Monash University";
      case "http://www.study.monash/media/links/faculty-websites/pharmacy":
        return "Faculty of Pharmacy and Pharmaceutical Sciences";
      case "http://www.study.monash/media/links/faculty-websites/law":
        return "Faculty of Law";
      default:
        return null;
    }
  }
  /**
   * Looks up a plugin through the Bukget search API and reports the best match to the channel.
   *
   * <p>{@code args[0]} is the plugin name to search for (lower-cased before use); an optional
   * {@code args[1]} is passed to the API as the {@code size} query parameter. The result whose
   * {@code plugin_name} equals the search term (ignoring case) is preferred; otherwise the first
   * result is used. With no arguments a usage notice is sent to the sender instead.
   *
   * @param event the message event that triggered the command
   * @param args the command arguments
   */
  @Override
  public void execute(MessageEvent event, String[] args) {
    User sender = event.getUser();
    Channel channel = event.getChannel();

    if (args.length == 0) {
      foxbot.sendNotice(
          sender,
          String.format(
              "Wrong number of args! Use %sbukkitsearch <plugin>",
              foxbot.getConfig().getCommandPrefix()));
      return;
    }

    String plugin = args[0].toLowerCase();
    String sizeQuery = args.length == 1 ? "" : ("?size=" + args[1]);
    String url =
        String.format("http://api.bukget.org/3/search/plugin_name/like/%s%s", plugin, sizeQuery);

    Connection conn =
        Jsoup.connect(url).timeout(500).followRedirects(true).ignoreContentType(true);
    String json;

    try {
      // Read the raw response body. Parsing the JSON as an HTML document and taking its text
      // would drop anything that looks like markup inside string values and collapse whitespace.
      json = conn.execute().body();
    } catch (IOException ex) {
      foxbot.log(ex);
      channel
          .send()
          .message(
              Utils.colourise(
                  String.format(
                      "(%s) &cAn error occurred while querying the Bukget API!",
                      Utils.munge(sender.getNick()))));
      return;
    }

    JSONArray jsonArray = new JSONArray(json);

    // Check the parsed array rather than the exact text "[]", so that an empty result with
    // different whitespace cannot fall through to getJSONObject(0).
    if (jsonArray.length() == 0) {
      channel
          .send()
          .message(
              Utils.colourise(
                  String.format("(%s) &cNo results found!", Utils.munge(sender.getNick()))));
      return;
    }

    // Default to the first result; an exact (case-insensitive) name match takes precedence.
    JSONObject found = jsonArray.getJSONObject(0);
    for (int i = 0; i < jsonArray.length(); i++) {
      JSONObject candidate = jsonArray.getJSONObject(i);
      if (candidate.getString("plugin_name").equalsIgnoreCase(plugin)) {
        found = candidate;
        break;
      }
    }

    String name = found.getString("plugin_name");
    String description = found.getString("description");
    String pluginUrl =
        String.format("http://dev.bukkit.org/bukkit-plugins/%s/", found.getString("slug"));

    if (description.isEmpty()) {
      description = "No description";
    }

    channel
        .send()
        .message(
            Utils.colourise(
                String.format(
                    "(%s) &2Name:&r %s &2Description:&r %s &2URL:&r %s",
                    Utils.munge(sender.getNick()), name, description, pluginUrl)));
  }
Exemple #30
0
  /**
   * Downloads {@code originalUrl}, reduces the page body to its {@code div.content-main} element
   * (with the {@code h1.title} heading removed when present) and saves the result to
   * {@code htmlfile}.
   *
   * <p>A {@code <base>} element pointing at the document's base URI is inserted into the head when
   * the page does not declare one. Saving is attempted at most twice.
   *
   * @param htmlfile the file the processed HTML is written to
   * @param originalUrl the URL of the page to download
   * @param intent not used by this implementation
   * @return the URL derived from {@code htmlfile} via {@code StringUtils.file2Url} on success, or
   *     {@code PROCESS_FAILED_URL} when the page has no body content, lacks the content element,
   *     cannot be fetched or cannot be saved
   */
  @Override
  protected String doProcess(File htmlfile, String originalUrl, Intent intent) {
    try {
      Connection coon = HttpConnection.connect(originalUrl);
      // We don't want to be redirected to another page (example: 10.254.7.4).
      coon.followRedirects(false);
      Document doc = coon.get();
      Element head = doc.head();
      Element body = doc.body();
      if (body.children().size() == 0) {
        Log.e(TAG, "body has no child with url=" + originalUrl);
        return PROCESS_FAILED_URL;
      }

      if (head.select("base").isEmpty()) {
        // Pin the base URI explicitly so the saved copy still has a base href.
        String baseUri = head.baseUri();
        Attributes attrs = new Attributes();
        attrs.put("href", baseUri);
        ArrayList<Element> baseElements = new ArrayList<>();
        baseElements.add(new Element(Tag.valueOf("base"), baseUri, attrs));
        head.insertChildren(0, baseElements);
      }

      Element div = doc.select("div.content-main").first();
      if (div == null) {
        Log.e(TAG, "not found specific element with url=" + originalUrl);
        return PROCESS_FAILED_URL;
      }

      // The heading is optional: pages without it must not fail with a NullPointerException.
      Element title = div.select("h1.title").first();
      if (title != null) {
        title.remove();
      }

      // Replace the whole body with just the main content element.
      body.empty();
      ArrayList<Element> content = new ArrayList<>();
      content.add(div);
      body.insertChildren(0, content);

      // Try two times.
      for (int attempt = 0; attempt < 2; attempt++) {
        if (FileUtil.saveStringToFile(doc.toString(), htmlfile, false)) {
          return StringUtils.file2Url(htmlfile, PROCESS_FAILED_URL);
        }
      }
      Log.e(TAG, "save html to file failed with url=" + originalUrl);
    } catch (IOException e) {
      // Also covers MalformedURLException, which is a subclass of IOException.
      Log.e(TAG, "failed to download url=" + originalUrl, e);
    }
    return PROCESS_FAILED_URL;
  }