示例#1
1
 /**
  * Downloads the blog post at {@code href}, parses the post and its comments, and saves the
  * result through {@code repository}.
  *
  * <p>The download is attempted twice: if the first request fails with an {@link IOException},
  * the method waits two seconds and tries once more.
  *
  * @param href URL of the blog post to fetch; also stored on the saved post
  * @param ourCopy existing copy of the post to update, or {@code null} to start from a new
  *     {@link BlogPost}
  * @param commentsOnly not used by this method
  * @param marker marker value stored on the saved post
  * @throws IOException if the second attempt also fails, or if the thread is interrupted while
  *     waiting to retry (the first failure is rethrown in that case)
  */
 public void scrapBlogPost(String href, BlogPost ourCopy, boolean commentsOnly, String marker)
     throws IOException {
   Document document;
   try {
     document = Jsoup.connect(href).get();
   } catch (IOException firstFailure) {
     try {
       Thread.sleep(2000);
     } catch (InterruptedException interrupted) {
       // Keep the interrupt visible to callers and give up instead of retrying.
       Thread.currentThread().interrupt();
       throw firstFailure;
     }
     document = Jsoup.connect(href).get();
   }

   BlogPost target = (ourCopy != null) ? ourCopy : new BlogPost();
   BlogPost post = parser.parseBlogPost(document, target);

   List<BlogComment> comments = parser.parseBlogComments(document);
   if (comments != null && !comments.isEmpty()) {
     post.setComments(comments);
     post.setCommentCount(comments.size());
   } else {
     post.setCommentCount(0);
   }

   post.setUrl(href);
   post.setMarker(marker);
   repository.save(post);
 }
 /**
  * Fetches the configured page and stores one value per configured attribute in {@code metaData}.
  *
  * <p>The page at {@code url} is requested with GET, or with POST (sending {@code requestData}),
  * depending on {@code method}. For every attribute, the text of the first element matching the
  * attribute's query is parsed by the attribute's value mapper and stored under the attribute's
  * name. Attributes whose query matches nothing are skipped.
  *
  * @param metaData target that receives the extracted values
  * @throws MetaDataException if {@code method} is neither {@code "GET"} nor {@code "POST"}
  *     (including {@code null}), if the request fails, or if a value cannot be mapped
  */
 @Override
 public void populateMetaData(MetaData metaData) throws MetaDataException {
   try {
     Document doc;
     // Constant-first equals: a null method is reported as unsupported instead of causing an NPE.
     if ("GET".equals(method)) {
       doc = Jsoup.connect(url).get();
     } else if ("POST".equals(method)) {
       doc = Jsoup.connect(url).data(requestData).post();
     } else {
       throw new MetaDataException("Unsupported HTML access method: " + method);
     }
     for (MetaDataAttribute attribute : attributes) {
       Elements elements = doc.select(attribute.getQuery());
       if (!elements.isEmpty()) {
         String sValue = elements.get(0).text();
         Object oValue = attribute.getValueMapper().parse(sValue);
         metaData.put(attribute.getName(), oValue);
       }
     }
   } catch (IOException e) {
     throw new MetaDataException(e);
   } catch (ValueMapperException e) {
     throw new MetaDataException(e);
   }
 }
  /*
   * I haven't found a direct way of extracting the download URL of a Mixcloud track.
   * Mixcloud's track preview URLs and full download URLs are similar. The preview URL for
   * a Mixcloud track is simple to extract.
   *
   * This method replaces the "previews" part of the preview URL with "cloudcasts/originals" and then
   * cycles through all of Mixcloud's stream servers until the download URL is found.
   *
   * Similarity between Mixcloud preview URL and full download URL:
   * http://stream8.mxcdn.com/previews/9/6/a/e/93a8-2d77-4573-85c5-68bfb679d9bc.mp3 - preview URL
   * http://stream11.mxcdn.com/cloudcasts/originals/9/6/a/e/93a8-2d77-4573-85c5-68bfb679d9bc.mp3 - download URL
   */
  private String generateStreamURL() throws IOException {
    String downloadUrl = this.getPreviewURL().replaceAll("previews", "cloudcasts/originals");

    try {
      @SuppressWarnings("unused")
      Response res =
          Jsoup.connect(downloadUrl)
              .ignoreContentType(true)
              .userAgent("Mozilla/5.0 (Windows NT 6.1; rv:17.0) Gecko/20100101 Firefox/17.0")
              .execute();
      return downloadUrl;
    } catch (HttpStatusException firstAttempt) {
      int serversToCycle = 30;
      for (int i = 1; i <= serversToCycle; ) {
        try {
          String cycledUrl = downloadUrl.replaceAll("stream[0-9]+", ("stream" + i));

          Response res =
              Jsoup.connect(cycledUrl)
                  .ignoreContentType(true)
                  .userAgent("Mozilla/5.0 (Windows NT 6.1; rv:17.0) Gecko/20100101 Firefox/17.0")
                  .execute();
          if (res.parse().toString().length() < 2000) i++;
          else return cycledUrl;
        } catch (HttpStatusException cycledAttempt) {
          i++;
        }
      }
    }

    return null;
  }
示例#4
0
 /**
  * Loads the login page, copies the login form's input fields and posts them back together with
  * the credentials and, optionally, a captcha solution.
  *
  * @param username account name sent as the {@code username} field
  * @param password password sent as the {@code password} field
  * @param captchaData empty (or {@code null}) when no captcha is being answered; otherwise at
  *     least three entries sent as {@code hip_solution}, {@code hip_token} and {@code fid}, in
  *     that order
  * @return the response of the login POST
  * @throws ConnectionException if the login page cannot be loaded, does not contain the
  *     {@code loginForm} element, or the POST fails
  * @throws IllegalArgumentException if {@code captchaData} has only one or two entries
  */
 private Response postToLogin(String username, String password, String[] captchaData)
     throws ConnectionException {
   boolean hasCaptcha = captchaData != null && captchaData.length > 0;
   // Fail before any network traffic rather than with an index error after the page was loaded.
   if (hasCaptcha && captchaData.length < 3) {
     throw new IllegalArgumentException(
         "captchaData needs solution, token and fid but has " + captchaData.length + " entries");
   }
   try {
     Map<String, String> data = new HashMap<>();
     Document loginDocument = Jsoup.connect(Endpoints.LOGIN_URL.url()).get();
     Element loginForm = loginDocument.getElementById("loginForm");
     if (loginForm == null) {
       // Raised as IOException so it is wrapped by the handler below like any other failure.
       throw new IOException("Login form not found at " + Endpoints.LOGIN_URL.url());
     }
     // Start from the form's own fields so that any preset values are posted back unchanged.
     for (Element input : loginForm.getElementsByTag("input")) {
       data.put(input.attr("name"), input.attr("value"));
     }
     Date now = new Date();
     data.put("timezone_field", new SimpleDateFormat("XXX").format(now).replace(':', '|'));
     data.put("username", username);
     data.put("password", password);
     data.put("js_time", String.valueOf(now.getTime() / 1000));
     if (hasCaptcha) {
       data.put("hip_solution", captchaData[0]);
       data.put("hip_token", captchaData[1]);
       data.put("fid", captchaData[2]);
       data.put("hip_type", "visual");
       data.put("captcha_provider", "Hip");
     } else {
       // Drop captcha fields that were copied from the form.
       data.remove("hip_solution");
       data.remove("hip_token");
       data.remove("fid");
       data.remove("hip_type");
       data.remove("captcha_provider");
     }
     return Jsoup.connect(Endpoints.LOGIN_URL.url()).data(data).method(Method.POST).execute();
   } catch (IOException e) {
     throw ExceptionHandler.generateException("While submitting credentials", e);
   }
 }
示例#5
0
    /**
     * Logs in with the credentials held by {@code MainActivity} and extracts marks and course
     * codes from the returned page into {@code grades} and {@code courses}.
     *
     * <p>If a request fails, the stack trace is printed and whatever was collected so far is kept.
     *
     * @param params unused
     * @return the HTML of the page returned by the login POST, or an empty string if a request
     *     failed before the page was received
     */
    @Override
    protected String doInBackground(Void... params) {
      String html = "";
      try {
        // The initial GET only serves to obtain the cookies that are sent with the login POST.
        Connection.Response loginForm =
            Jsoup.connect("https://ta.yrdsb.ca/yrdsb/").method(Connection.Method.GET).execute();

        // Login to page using user/pass entered in MainActivity
        Document document =
            Jsoup.connect("https://ta.yrdsb.ca/yrdsb/")
                .data("cookieexists", "false")
                .data("username", MainActivity.usernameString)
                .data("password", MainActivity.passwordString)
                .data("submit", "Login")
                .cookies(loginForm.cookies())
                .post();

        // Convert document into string for easier processing
        html = document.toString();

        // Debug output: number of table rows under the matched elements. The document is queried
        // directly instead of being re-parsed from its own serialisation.
        System.out.println(
            document
                .select("[width=85%], [border=0], [cellspacing=0], [cellpadding=5]")
                .select("tr")
                .size());

        // Prepare lists to store grades and course codes
        grades = new ArrayList<>();
        courses = new ArrayList<>();

        // Every "current mark = <number>" occurrence contributes one grade.
        Matcher markMatcher = Pattern.compile("current mark\\s?=\\s?(\\d+\\.?\\d*)").matcher(html);
        while (markMatcher.find()) {
          grades.add(Double.valueOf(markMatcher.group(1)));
        }

        // Course codes: three letters, a digit, a letter, a digit.
        Matcher courseMatcher =
            Pattern.compile("([a-zA-Z]{3}[0-9]{1}[a-zA-Z]{1}[0-9]{1})").matcher(html);
        while (courseMatcher.find()) {
          courses.add(courseMatcher.group(1));
        }

      } catch (IOException e) {
        e.printStackTrace();
      }
      return html;
    }
示例#6
0
 /**
  * Resolves the URL that is reached by following the last link on the page at {@code url}.
  *
  * <p>The last anchor of the first page is fetched, and the text between {@code url=} and the
  * following {@code .A" />} in that second page is taken as the target path (any {@code amp;}
  * is removed from it).
  *
  * @param url page whose last link is followed
  * @return the target path prefixed with {@code http://bbs.nju.edu.cn/}
  * @throws IOException if a page cannot be fetched, the first page contains no links, or the
  *     second page does not contain the expected markers
  */
 public static final String getAllTopicArtileUrl(String url) throws IOException {
   final String base = "http://bbs.nju.edu.cn/";

   Elements anchors = Jsoup.connect(url).get().select("a");
   if (anchors.isEmpty()) {
     throw new IOException("No links found at " + url);
   }
   String intermediate = base + anchors.get(anchors.size() - 1).attr("href");

   String nextContent = Jsoup.connect(intermediate).get().toString();
   int start = nextContent.indexOf("url=");
   int end = nextContent.indexOf(".A\" />");
   // Both markers must be present and in order, otherwise the substring would be garbage.
   if (start < 0 || end < 0 || end + 2 < start + 4) {
     throw new IOException("No redirect target found at " + intermediate);
   }
   // "+ 4" skips "url="; "+ 2" keeps the ".A" suffix as part of the target.
   String target = nextContent.substring(start + 4, end + 2);
   return base + target.replace("amp;", "");
 }
    /**
     * Downloads the article at {@code url} and sends the outer HTML of its matching elements to
     * {@code myHandler} in a message with code {@code NEWCONTENTRECEIVED}.
     *
     * <p>A toast is shown instead when {@code Utils.isNET} reports no network. An
     * {@link IOException} during the download is only printed.
     */
    @Override
    public void run() {
      if (!Utils.isNET(NewsContentActivity.this)) {
        Utils.showToast(NewsContentActivity.this, "网络不可用哦,亲!", Toast.LENGTH_SHORT);
        return;
      }

      try {
        Document page = Jsoup.connect(url).timeout(8000).get();
        if (page == null) {
          Utils.showToast(NewsContentActivity.this, "网络不给力哦,亲,请返回再进入吧!", Toast.LENGTH_SHORT);
          return;
        }

        // Concatenate the HTML of every matched element in document order.
        Elements paragraphs = page.select("#Cnt-Main-Article-QQ P");
        StringBuilder html = new StringBuilder();
        int count = paragraphs.size();
        for (int idx = 0; idx < count; idx++) {
          html.append(paragraphs.get(idx).outerHtml());
        }
        String content = html.toString();

        Bundle payload = new Bundle();
        payload.putString("content", content);
        Log.i("content", content);

        Message note = new Message();
        note.setData(payload);
        note.what = NewsContentActivity.NEWCONTENTRECEIVED;
        myHandler.sendMessage(note);
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  @Override
  public SearchResult[] getSearchResults(String searchString) throws IOException {
    Document doc = Jsoup.connect(searchString).timeout(CONNECTION_TIMEOUT_VALUE).get();
    boolean onSearchResultsPage = doc.location().contains("adultSearch.htm");
    // found the movie without a search results page
    if (doc.location() != null && !onSearchResultsPage) {
      String idOfPage = getIDStringFromDocumentLocation(doc);
      String posterPath = getPosterPreviewPathFromIDString(idOfPage);
      String label = doc.select("title").first().text();
      Thumb previewImage = new Thumb(posterPath);
      // SearchResult directResult = new SearchResult(doc.location());
      SearchResult result = null;
      if (posterPath != null) result = new SearchResult(doc.location(), label, previewImage);
      else result = new SearchResult(doc.location(), label, null);

      SearchResult[] directResultArray = {result};
      return directResultArray;
    }
    Elements foundMovies = doc.select("table[width=690]:contains(Wish List) tr tbody:has(img)");
    LinkedList<SearchResult> searchList = new LinkedList<SearchResult>();

    for (Element movie : foundMovies) {
      String urlPath = movie.select("a").first().attr("href");
      String thumb = movie.select("img").first().attr("src");
      String label = movie.select("img").first().attr("alt");
      SearchResult searchResult = new SearchResult(urlPath, label, new Thumb(thumb));
      if (!searchList.contains(searchResult)) searchList.add(searchResult);
    }
    return searchList.toArray(new SearchResult[searchList.size()]);
  }
示例#9
0
 /**
  * Looks up a trailer link for {@code movie} on YouTube.
  *
  * <p>The page for the formatted movie name is fetched and the href of the first
  * {@code yt-lockup-title} link is appended to the YouTube base URL. Any failure while fetching
  * is printed and the base URL is returned.
  *
  * @param movie movie whose year and name are used for the lookup
  * @return the literal string {@code "null"} for movies from before 1990; otherwise
  *     {@code http://www.youtube.com} followed by the extracted link path, or the base URL alone
  *     when no link could be extracted
  */
 private static String getTrailer(Movie movie) {
   if (Integer.valueOf(movie.getMovieYear()) < 1990) {
     return "null";
   }

   final String base = "http://www.youtube.com";
   final String titleMarker = "class=\"yt-lockup-title \"><a href=\"";
   final String hrefMarker = "<a href=\"";
   final String hrefEndMarker = "class=\"yt-uix-sessionlink";

   String link = formatYoutubeString(movie.getMovieName());
   try {
     String html = Jsoup.connect("http://www.youtube.com/" + link).get().body().toString();

     int titleStart = html.indexOf(titleMarker);
     if (titleStart < 0) {
       return base;
     }
     // Look at no more than 100 characters from the marker, clamped to the end of the page.
     String window = html.substring(titleStart, Math.min(titleStart + 100, html.length()));

     int hrefStart = window.indexOf(hrefMarker) + hrefMarker.length();
     // "- 2" drops the closing quote and the space that precede the class attribute.
     int hrefEnd = window.indexOf(hrefEndMarker) - 2;
     if (hrefEnd > hrefStart) {
       return base + window.substring(hrefStart, hrefEnd);
     }
   } catch (Exception e) {
     // Best effort: report the problem and fall back to the base URL.
     System.out.println(e.toString());
   }
   return base;
 }
示例#10
0
 /**
  * Fetches a movie page and extracts up to three actor names from its "Stars:" section.
  *
  * <p>The section between the {@code Stars:} heading and the text "See full cast and crew" is cut
  * out of the page body, and the names are read from the {@code itemprop="name"} spans inside it.
  * Any exception is printed and the array is returned as filled so far.
  *
  * @author defq0n
  * @param pageLink path that is appended to {@code http://imdb.com} to form the page URL.
  * @return array of three strings; entries that could not be extracted are empty strings.
  */
 private static String[] getMovieActors(String pageLink) {
   String[] movieActors = {"", "", ""};
   try {
     Document d = Jsoup.connect("http://imdb.com" + pageLink).get();
     Element e = d.body();
     String html = e.toString();
     String actorsDiv = "";
     // Copy the text between the heading and "See full cast and crew"; 30 is the length of the
     // heading marker, so copying starts right after it.
     for (int i = html.indexOf("<h4 class=\"inline\">Stars:</h4>") + 30;
         i < html.indexOf("See full cast and crew");
         i++) {
       actorsDiv += html.charAt(i);
     }
     // tempDiv is the part of actorsDiv that has not been consumed yet.
     String tempDiv = actorsDiv;
     for (int i = 0; i < 3; i++) { // take the first three names
       String actor = "";
       String t = "itemprop=\"url\"><span class=\"itemprop\" itemprop=\"name\">";
       // The name is the text between the marker and the first "</span></a>" in tempDiv.
       for (int j = tempDiv.indexOf(t) + t.length(); j < tempDiv.indexOf("</span></a>"); j++) {
         actor += tempDiv.charAt(j);
       }
       movieActors[i] = actor;
       tempDiv = "";
       // Continue after this name's closing tag; 7 is the length of "</span>". The position is
       // looked up in the full section (actorsDiv), not in the remainder.
       for (int j = actorsDiv.indexOf(actor + "</span>") + actor.length() + 7;
           j < actorsDiv.length();
           j++) {
         tempDiv += actorsDiv.charAt(j);
       }
     }
   } catch (Exception e) {
     // Covers network errors as well as index errors from the scanning above.
     System.out.println(e.toString());
   }
   return movieActors;
 }
示例#11
0
 /**
  * Fetches the page at {@code url}, reads the {@code qualityArr} JSON object embedded in it and
  * queues the best available video for download.
  *
  * <p>Qualities are tried in the order 1080p, 720p, 480p, 240p.
  *
  * @throws IOException if the page cannot be fetched, contains no {@code qualityArr} assignment,
  *     or lists none of the known qualities
  */
 @Override
 public void rip() throws IOException {
   logger.info("    Retrieving " + this.url.toExternalForm());
   Document doc = Jsoup.connect(this.url.toExternalForm()).userAgent(USER_AGENT).get();

   Matcher m =
       Pattern.compile("^.*var qualityArr = (.*});.*$", Pattern.DOTALL).matcher(doc.html());
   if (!m.matches()) {
     throw new IOException("Failed to rip video at " + this.url);
   }

   try {
     JSONObject json = new JSONObject(m.group(1));
     String[] preferred = {"1080p", "720p", "480p", "240p"};
     String vidUrl = null;
     for (int idx = 0; idx < preferred.length; idx++) {
       if (json.has(preferred[idx])) {
         vidUrl = json.getString(preferred[idx]);
         break;
       }
     }
     if (vidUrl == null) {
       throw new IOException("Unable to find video URL at " + this.url);
     }
     addURLToDownload(new URL(vidUrl), HOST + "_" + getGID(this.url));
     waitForThreads();
   } catch (JSONException e) {
     // Logged here and passed on unchanged.
     logger.error("Error while parsing JSON at " + url, e);
     throw e;
   }
 }
示例#12
0
 /**
  * Fetches the page at {@code url} and extracts the fee from its table.
  *
  * <p>The request is attempted up to five times; each {@link IOException} is printed. Earlier
  * versions retried without limit, which never returned when the page stayed unreachable.
  *
  * @param url page to read the fee from
  * @return the result of {@code getFee} on the text of the fee cell; {@code url} itself when the
  *     page has no such cell; an empty string when every attempt failed
  */
 public static String requestFee(String url) {
   final String feeCell =
       "#block-system-main > table > tbody > tr:nth-child(2) > td:nth-child(4)";
   final int maxAttempts = 5;

   for (int attempt = 1; attempt <= maxAttempts; attempt++) {
     try {
       Document doc = Jsoup.connect(url).timeout(5000).get();
       if (doc.select(feeCell).size() > 0) {
         return getFee(doc.select(feeCell).text());
       }
       // Page loaded but has no fee cell: hand the URL back.
       return url;
     } catch (IOException e) {
       System.out.println("requestFee : " + e.getMessage());
     }
   }
   return "";
 }
示例#13
0
  public static void initMajorList(String originalUrl) {

    System.out.println("preparing majorList");

    boolean finish = false;
    do {
      try {
        majorList.clear();
        Connection conn = Jsoup.connect(originalUrl);
        Document doc = conn.timeout(10000).get();
        Elements es = doc.select("#accordion__target-3 > div.course-listing__box > a");
        for (Element e : es) { // major
          MajorForCollection major = new MajorForCollection();
          major.setLevel(LEVEL);
          major.setTitle(e.select("h3").get(0).text().trim());
          major.setType(e.select("p").get(0).text().replaceAll("-[\\s\\S]*", "").trim());
          major.setUrl(e.select("a").get(0).attr("href"));
          majorList.add(major);
        }
        ;
        finish = true;
      } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
      }
    } while (!finish);

    System.out.println("majorList prepared");
    System.out.println("majorList size: " + majorList.size());
  }
示例#14
0
 /**
  * Posts the login form and stores the session cookie from the response in {@code sessionId}.
  *
  * <p>A socket timeout or TLS handshake failure is logged and handed to
  * {@code generateSessionIdRetry(3000)}; any other I/O failure is wrapped in an
  * {@code EgetException}.
  */
 @Override
 public void generateSessionId() {
   LOGGER.info("login to DMM");
   try {
     String sesId =
         Jsoup.connect("https://www.dmm.co.jp/my/")
             .data("login_id", userId)
             .data("password", password)
             .data("sava_password", "1")
             .data("save_login_id", "1")
             .data("act", "commit")
             .method(Method.POST)
             .execute()
             .cookie(SESSION_ID_KEY);
     LOGGER.info("sessionId={}", sesId);
     this.sessionId = sesId;
   } catch (SocketTimeoutException | SSLHandshakeException e) {
     LOGGER.warn("login failed", e);
     generateSessionIdRetry(3000);
   } catch (IOException e) {
     throw new EgetException("failed to login", e);
   }
 }
示例#15
0
  public static void processPage(String URL) throws SQLException, IOException {
    // check if the given URL is already in database
    String sql = "select * from Record where URL = '" + URL + "'";
    ResultSet rs = db.runSql(sql);
    if (rs.next()) {

    } else {
      // store the URL to database to avoid parsing again
      sql = "INSERT INTO  test.Record " + "(URL) VALUES " + "(?);";
      PreparedStatement stmt = db.conn.prepareStatement(sql, Statement.RETURN_GENERATED_KEYS);
      stmt.setString(1, URL);
      stmt.execute();

      // get useful information
      Document doc = Jsoup.connect("http://www.mit.edu/").get();

      if (doc.text().contains("PhD")) {
        System.out.println(URL);
      }

      // get all links and recursively call the processPage method
      Elements questions = doc.select("a[href]");
      for (Element link : questions) {
        if (link.attr("href").contains("mit.edu")) processPage(link.attr("abs:href"));
      }
    }
  }
示例#16
0
 /**
  * Downloads the latest change log without following redirects.
  *
  * @return the trimmed response body
  * @throws IOException if the request fails or the body contains {@code <html>} (in any case)
  */
 public static String getLatestChangeLog() throws IOException {
   String rawBody = Jsoup.connect(URL_LATEST_CHANGE_LOG).followRedirects(false).execute().body();
   String changeLog = rawBody.trim();
   Log.d(LOG_TAG, "getLatestChangeLog changeLog: " + changeLog);

   // An HTML document instead of plain text means some other page was served.
   boolean looksLikeHtml = changeLog.toLowerCase().contains("<html>");
   if (looksLikeHtml) {
     throw new IOException("Wrong page loaded");
   }
   return changeLog;
 }
示例#17
0
 /**
  * Loads the site's start page and analyses its body.
  *
  * <p>The page is requested up to three times. If all attempts fail, the method returns without
  * setting {@code isInit}. On success the page title is stored in {@code title} and the body is
  * passed to {@code getChildElement}.
  */
 public void init() {
   // Other sites previously used here:
   //   http://www.taqpx.org/cms/
   //   http://www.jyzdy.org/zgclCMS/
   //   http://www.tsinghua.edu.cn/publish/newthu/index.html
   final String url = "http://www.thjnpx.org/cms/";

   Document doc = null;
   for (int attempt = 1; attempt < 4 && doc == null; attempt++) {
     try {
       doc = Jsoup.connect(url).get();
     } catch (IOException e) {
       System.out.println("连接超时次数:" + attempt);
     }
   }
   if (doc == null) {
     return;
   }

   System.out.println("--------------------分析中--------------------------------");
   title = doc.title();
   System.out.println("网站链接:" + url);
   System.out.println("网站标题:" + title);
   deleteComent(null);
   getChildElement(doc.body(), 0);
   isInit = true;
 }
示例#18
0
 /**
  * Determines the course type by querying the Unistats widget referenced by the {@code #kw}
  * element of {@code doc}.
  *
  * <p>The widget URL is assembled from the element's {@code data-*} attributes. The type is the
  * first word of the widget's course heading. Any exception while fetching the widget is
  * printed.
  *
  * @param doc course page
  * @return the first word of the widget heading (or the whole heading if it has no space after
  *     its first character); an empty string if the page has no {@code #kw} element, the widget
  *     has no heading, or the widget request failed
  */
 public static String getType(Document doc) {
   Element widget = doc.select("#kw").first();
   if (widget == null) {
     return "";
   }

   String typeURL =
       "http://widget.unistats.ac.uk/Widget/"
           + widget.attr("data-institution")
           + "/"
           + widget.attr("data-course")
           + "/"
           + widget.attr("data-orientation")
           + "/"
           + "null/"
           + widget.attr("data-language")
           + "/"
           + widget.attr("data-kismode");

   String type = "";
   try {
     Document tmpDoc = Jsoup.connect(typeURL).timeout(10000).get();
     Element heading = tmpDoc.select("#kisWidget > div.widgetCourse > h1").first();
     if (heading != null) {
       String text = heading.text().trim();
       int firstSpace = text.indexOf(" ");
       if (firstSpace > 0) {
         type = text.substring(0, firstSpace);
       } else {
         type = text;
       }
     }
   } catch (Exception ex) {
     ex.printStackTrace();
   }
   return type;
 }
示例#19
0
  public List<MenuMeal> getMenuMeals(int number) {
    Document doc = null;
    List<MenuMeal> meals = new ArrayList<>();

    try {
      doc =
          Jsoup.connect(String.format(URL, number))
              .userAgent("Chrome/49.0.2623.112")
              .referrer("https://www.google.ru/")
              .timeout(7000)
              .get();
    } catch (IOException e) {
      e.printStackTrace();
    }
    if (doc == null) return meals;

    Elements elements = doc.select("td[width=400");

    if (!elements.isEmpty()) {
      for (Element element : elements) {
        Element parent = element.parent();
        MenuMeal menuMeal = new MenuMeal();

        menuMeal.setDescription(parent.select("div[id=ssilka]").first().text());
        String cost = parent.select("div[id=ssilka]").last().text();
        menuMeal.setCost(Integer.valueOf(cost.substring(0, cost.indexOf("-"))));

        meals.add(menuMeal);
      }
      return meals;
    } else {
      return meals;
    }
  }
示例#20
0
  @Override
  public String getCiteItem() {
    String baseurl = "http://pubs.rsc.org/en/content/getformatedresult/";

    String doi = null;
    String posturl = null;
    try {
      Document doc = Jsoup.connect(url).timeout(30000).get();
      doi = doc.select("input#DOI").attr("value");
      posturl = baseurl + doi.toLowerCase() + "?downloadtype=article";
    } catch (UnsupportedEncodingException e2) {
      e2.printStackTrace();
      return null;
    } catch (IOException e2) {
      e2.printStackTrace();
      return null;
    }

    HttpURLConnection con = null;
    try {
      String postParams = "ResultAbstractFormat=BibTex&go=";

      URL u = new URL(posturl);
      con = (HttpURLConnection) u.openConnection();
      con.setRequestMethod("POST");
      con.setDoOutput(true);
      con.setDoInput(true);
      con.setUseCaches(false);
      con.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
      con.setRequestProperty(
          "User-Agent", "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0");

      @SuppressWarnings("resource")
      OutputStreamWriter osw = new OutputStreamWriter(con.getOutputStream(), "UTF-8");
      osw.write(postParams);
      osw.flush();
      osw.close();
    } catch (Exception e) {
      e.printStackTrace();
      return null;
    } finally {
      if (con != null) {
        con.disconnect();
      }
    }

    StringBuilder buffer = new StringBuilder();
    try {
      BufferedReader br = new BufferedReader(new InputStreamReader(con.getInputStream(), "UTF-8"));
      String temp;
      while ((temp = br.readLine()) != null) {
        buffer.append(temp);
        buffer.append("\n");
      }
    } catch (Exception e) {
      e.printStackTrace();
      return null;
    }
    return buffer.toString();
  }
示例#21
0
 /**
  * Downloads the latest version code without following redirects.
  *
  * @return the trimmed response body parsed as an integer
  * @throws IOException if the request fails
  * @throws NumberFormatException if the body is not an integer
  */
 public static Integer getLatestVersionCode() throws IOException, NumberFormatException {
   String rawBody =
       Jsoup.connect(URL_LATEST_VERSION_CODE).followRedirects(false).execute().body();
   Integer versionCode = Integer.valueOf(rawBody.trim());
   Log.d(LOG_TAG, "getLatestVersionCode versionCode: " + versionCode);
   return versionCode;
 }
示例#22
0
  /** Mudah is not standardized, result will be messy if crawl them */
  @Override
  public List<Item> parse(String query, int size) throws IOException {

    // request for a page
    Document doc =
        Jsoup.connect("http://www.mudah.my/li?q=" + query)
            .userAgent(Constant.HTTP_USER_AGENT)
            .timeout(Constant.HTTP_TIMEOUT)
            .get();

    Elements listS = doc.select("div.listing_thumbs").first().select("div.list_ads");

    ArrayList<Item> result = new ArrayList<Item>(size);
    for (int i = 0; i < listS.size(); i++) {
      Element list = listS.get(i);

      String img = "";
      list.select("div.image_thumb");
      Elements imgS = list.select("div.image_thumb > a + img");
      if (imgS.size() < 0) { // some may not have images
        img = imgS.first().attr("href");
      }

      Element listE = list.select("li.listing_ads_title").first();
      String title = listE.child(0).text();
      String url = listE.child(0).attr("href");
      String price = listE.text();
      price = price.substring(price.lastIndexOf("RM") + 2).trim().replaceAll(" ", "");
      int dPrice = Integer.parseInt(price);

      result.add(new Item("Mudah", title, dPrice, img, url));
    }

    return result;
  }
 /**
  * Fetches {@code url} and returns its bookmark links.
  *
  * @param url page to fetch
  * @param pattern not used by this implementation
  * @return all {@code a[rel=bookmark]} elements inside the body
  * @throws IOException if the page cannot be fetched
  */
 @Override
 public Elements fetchContent(String url, String pattern) throws IOException {
   return Jsoup.connect(url).get().select("body a[rel=bookmark]");
 }
示例#24
0
  private static Response execute(
      String url, Method method, Map<String, String> cookies, Map<String, String> data) {
    Response response = null;

    Connection connection = Jsoup.connect(url);
    connection.method(method);

    connection.timeout(10000);
    connection.ignoreContentType(true);
    connection.maxBodySize(0);

    if (cookies != null) {
      connection.cookies(cookies);
    }

    if (data != null) {
      for (Entry<String, String> entry : data.entrySet()) {
        String key = entry.getKey();
        String value = entry.getValue();

        connection.data(key, value);
      }
    }

    try {
      response = connection.execute();
    } catch (IOException e) {
      e.printStackTrace();
    }

    return response;
  }
示例#25
0
文件: Worker.java 项目: m1/Parker
  public Worker(String url, boolean verbose) throws Exception {
    Document doc;
    doc = Jsoup.connect(url).get();
    // select anchors with href only
    Elements links = doc.select("a[href]");
    String l_Href;
    String host;
    int linksNum;
    Parser parser;
    for (Element link : links) {
      // absolute = http:// added
      l_Href = link.attr("abs:href");
      if (!l_Href.isEmpty()) {
        parser = new Parser(l_Href);
        host = parser.getHost();
        // if tempStats contains the url, add one to the value
        if (tempStats.containsKey(host)) {
          linksNum = tempStats.get(host);
          tempStats.put(host, linksNum += 1);
        }
        // if it doesn't, add it

        else {
          tempStats.put(host, 1);
        }
        // parse the url
        tempQueue.add(parser.getURL());
      }
    }
    if (verbose) {
      System.out.println(
          Thread.currentThread().getName() + " : " + tempQueue.size() + " links from " + url);
    }
  }
示例#26
0
 /**
  * Loads the jkanime.net home page on a separate thread and waits for it to finish.
  *
  * <p>Every {@code home_portada_bg} element that has both a link and an image becomes one entry.
  * A failed request is printed and yields the entries collected so far.
  *
  * @return the entries found, or {@code null} if the calling thread was interrupted while
  *     waiting (its interrupt flag is restored)
  */
 public static ArrayList<EntryModel> getPopularContent() {
   final ArrayList<EntryModel> result = new ArrayList<>();
   Thread thread =
       new Thread(
           () -> {
             try {
               Document document = Jsoup.connect("http://jkanime.net/").get();
               for (Element element : document.getElementsByClass("home_portada_bg")) {
                 Element anchor = element.getElementsByTag("a").first();
                 Element image = element.getElementsByTag("img").first();
                 if (anchor == null || image == null) {
                   // Incomplete tile: skip it rather than letting an NPE end the thread.
                   continue;
                 }
                 result.add(
                     new EntryModel(
                         Constants.TYPE_SHOW,
                         anchor.text(),
                         anchor.attr("abs:href"),
                         image.attr("src")));
               }
             } catch (IOException e) {
               e.printStackTrace();
             }
           });
   thread.start();
   try {
     thread.join();
     return result;
   } catch (InterruptedException e) {
     Thread.currentThread().interrupt();
     e.printStackTrace();
     return null;
   }
 }
示例#27
0
 /**
  * Runs a jkanime.net search on a separate thread and waits for it to finish.
  *
  * <p>Spaces in {@code query} are replaced by underscores. Every {@code search} element that has
  * both a {@code titl} element and an image becomes one entry. A failed request is printed and
  * yields the entries collected so far.
  *
  * @param query search text
  * @return the entries found, or {@code null} if the calling thread was interrupted while
  *     waiting (its interrupt flag is restored)
  */
 public static ArrayList<EntryModel> getSearchResults(final String query) {
   final ArrayList<EntryModel> result = new ArrayList<>();
   Thread thread =
       new Thread(
           () -> {
             try {
               Document document =
                   Jsoup.connect("http://jkanime.net/buscar/" + query.replace(" ", "_")).get();
               for (Element element : document.getElementsByClass("search")) {
                 Element titleLink = element.getElementsByClass("titl").first();
                 Element image = element.getElementsByTag("img").first();
                 if (titleLink == null || image == null) {
                   // Incomplete hit: skip it rather than letting an NPE end the thread.
                   continue;
                 }
                 result.add(
                     new EntryModel(
                         Constants.TYPE_SHOW,
                         titleLink.text(),
                         titleLink.attr("abs:href"),
                         image.attr("src")));
               }
             } catch (IOException e) {
               e.printStackTrace();
             }
           });
   thread.start();
   try {
     thread.join();
     return result;
   } catch (InterruptedException e) {
     Thread.currentThread().interrupt();
     e.printStackTrace();
     return null;
   }
 }
  @Override
  public List<String> parseCategory(String categoryName, String categoryURL) {
    // TODO Auto-generated method stub

    List<String> linksByCategoryList = null;

    try {

      Document doc = Jsoup.connect(categoryURL).timeout(Constants.MAX_DELAY_TIME * 1000).get();

      Elements links = doc.select("div[class=views-field views-field-title]").select("a");

      if (links != null && links.size() > 0) {

        linksByCategoryList = new ArrayList<String>();

        for (Element element : links) {

          String newsLink = element.attr("href");
          newsLink = newsLink.substring(1);

          linksByCategoryList.add(newsLink);
        }
      }

    } catch (IOException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }

    return linksByCategoryList;
  }
示例#29
0
    /**
     * Posts the login form to {@code urls[0]} and stores the response page in
     * {@code mLoginResponse} and the {@code COOKIE_TYPE} cookie in {@code mCookieValue}.
     *
     * @param urls login URL as the first element
     * @return {@code mLoginResponse}; left at its previous value if the request failed (the
     *     stack trace is printed)
     */
    @Override
    protected String doInBackground(String... urls) {
      try {
        Connection.Response res =
            Jsoup.connect(urls[0])
                .timeout(3000)
                .method(Method.POST)
                .data("eid", mUserName)
                .data("pw", mPassword)
                .data("submit", "Login")
                .execute();

        String page = res.parse().toString();

        // get the cookie
        mCookieValue = res.cookie(COOKIE_TYPE);
        mLoginResponse = page;
      } catch (IOException e) {
        e.printStackTrace();
      }
      return mLoginResponse;
    }
示例#30
-1
 /**
  * Downloads the feed entry for {@code ID_VIDEO}, copies its fields into {@code values}, then
  * fetches the author feed for the author's thumbnail.
  *
  * <p>Each field is read once and reused for the optional debug log. A missing thumbnail or
  * author URI yields an empty string instead of an exception. On an {@link IOException},
  * {@code error} is set and the stack trace is printed.
  *
  * @param params unused
  * @return always {@code null}
  */
 @Override
 protected Object doInBackground(Object[] params) {
   final String userAgent =
       "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.22 (KHTML, like Gecko) "
           + "Chrome/25.0.1364.172 Safari/537.22";
   try {
     Document feed =
         Jsoup.connect("https://gdata.youtube.com/feeds/api/videos/" + ID_VIDEO + "?v=2")
             .userAgent(userAgent)
             .timeout(60000)
             .ignoreContentType(true)
             .get();

     String title = feed.getElementsByTag("title").text();
     String description = feed.getElementsByTag("media:description").text();
     String likes = feed.select("yt|rating").attr("numLikes");
     String dislikes = feed.select("yt|rating").attr("numDislikes");
     String average = feed.select("gd|rating").attr("average");
     String views = feed.select("yt|statistics").attr("viewCount");
     String duration = feed.select("yt|duration").attr("seconds");
     // attr() on the selection returns "" when nothing matches, where first() would be null.
     String thumbnail = feed.select("media|thumbnail").attr("url");
     String author = feed.getElementsByTag("name").text();
     String authorId = feed.getElementsByTag("yt:uploaderId").text();
     String commentCount = feed.select("gd|feedLink").attr("countHint");
     String commentFeed = feed.select("gd|feedLink").attr("href");
     String category = feed.select("media|category").attr("label");
     String uploaded = feed.getElementsByTag("yt:uploaded").text();

     values.setTitleVideo(title);
     values.setDescriptionVideo(description);
     values.setLikeVideo(likes);
     values.setDislikeVideo(dislikes);
     values.setAverageVideo(average);
     values.setViewsVideo(views);
     values.setDurationVideo(duration);
     values.setThumbnailVideo(thumbnail);
     values.setAuthorVideo(author);
     values.setAuthorVideoId(authorId);
     values.setNumberCommentsVideo(commentCount);
     values.setFeedCommentsVideo(commentFeed);
     values.setCategoryVideo(category);
     values.setDateVideo(uploaded);

     String authorUri = feed.getElementsByTag("uri").text();
     String authorThumbnail = "";
     // Without an author URI there is nothing to fetch; connecting to "" would throw.
     if (!authorUri.isEmpty()) {
       Document image =
           Jsoup.connect(authorUri)
               .userAgent(userAgent)
               .timeout(60000)
               .ignoreContentType(true)
               .get();
       authorThumbnail = image.select("media|thumbnail").attr("url");
     }
     values.setThumbnailAuthor(authorThumbnail);

     if (showLog) {
       Log.d(TAG_LOG, title);
       Log.d(TAG_LOG, description);
       Log.d(TAG_LOG, likes);
       Log.d(TAG_LOG, dislikes);
       Log.d(TAG_LOG, average);
       Log.d(TAG_LOG, views);
       Log.d(TAG_LOG, duration);
       Log.d(TAG_LOG, thumbnail);
       Log.d(TAG_LOG, author);
       Log.d(TAG_LOG, authorId);
       Log.d(TAG_LOG, commentCount);
       Log.d(TAG_LOG, commentFeed);
       Log.d(TAG_LOG, category);
       Log.d(TAG_LOG, uploaded);
       Log.d(TAG_LOG, authorThumbnail);
     }
   } catch (IOException e) {
     error = true;
     e.printStackTrace();
   }
   return null;
 }