/**
 * Downloads the page at {@code pageURL} through an authenticated HTTP proxy, extracts the text
 * of its {@code <title>} and {@code <body>} elements and hands it to {@code classifyText}.
 *
 * @param pageURL absolute URL of the page to classify
 * @return the value returned by {@code classifyText}, or {@code "N/A"} when an
 *     {@link IOException} occurs while fetching the page
 */
public String classifyUrl(String pageURL) {
    try {
      URL url = new URL(pageURL);
      // SECURITY NOTE(review): proxy address and credentials are hard-coded in source. They
      // should be moved to configuration and the exposed password rotated.
      Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress("202.141.80.22", 3128));
      Authenticator authenticator =
          new Authenticator() {
            @Override
            public PasswordAuthentication getPasswordAuthentication() {
              return new PasswordAuthentication("b.revanth", "batman9903".toCharArray());
            }
          };
      // Installs the authenticator as the process-wide default.
      Authenticator.setDefault(authenticator);
      URLConnection urlConnection = url.openConnection(proxy);
      urlConnection.connect();

      StringBuilder webPageBuffer = new StringBuilder();
      // try-with-resources closes the reader (and the underlying stream) on every path.
      // NOTE(review): the reader decodes with the platform default charset, as before.
      try (BufferedReader inputReader =
          new BufferedReader(new InputStreamReader(urlConnection.getInputStream()))) {
        String line;
        while ((line = inputReader.readLine()) != null) {
          // Keep the line break so words on adjacent lines are not glued together.
          webPageBuffer.append(line).append('\n');
        }
      }

      // The second argument of Jsoup.parse is the base URI, not a charset name.
      Document document = Jsoup.parse(webPageBuffer.toString(), pageURL);
      Elements title = document.select("title");
      Elements body = document.select("body");

      Log.i("Now Classifying ", pageURL);
      return classifyText(title.text() + "\n" + body.text());
    } catch (IOException e) {
      Log.i("Error:", e.toString());
      e.printStackTrace();
    }
    return "N/A";
  }
Esempio n. 2
0
  /**
   * Parses {@code html} and fills a new {@code Holder} from fixed CSS selector paths.
   *
   * <p>Sets {@code url}, {@code dishType}, {@code title}, {@code method} and
   * {@code mainMaterial}, and adds to {@code steps} the non-empty text of every step container
   * that has a child with class {@code step}. Step containers are looked up under
   * {@code div.editnew.edit > div.content.clearfix} first; if that matches nothing, the
   * {@code div.edit > p} paragraphs are used instead.
   *
   * @param html page markup to parse
   * @param url page URL, used as the base URI and stored on the holder
   * @return the populated holder
   */
  public Holder doParse(String html, String url) {
    final String mainPath = "body > div.main_w.clearfix > div.main.clearfix";
    final String headerPath = mainPath + " > div.cp_header.clearfix > div.cp_main_info_w";
    final String bodyLeftPath = mainPath + " > div.cp_body.clearfix > div.cp_body_left";
    final String measurePath = bodyLeftPath + " > div.measure";

    Holder holder = new Holder();
    holder.url = url;

    Document doc = Jsoup.parse(html, url);

    holder.dishType = doc.select(mainPath + " > ul > li:nth-child(5) > a").text();
    holder.title = doc.select(headerPath + " > div.info1 > h1 > a").text();
    holder.method = doc.select(headerPath + " > div.info2 > ul > li:nth-child(1) > a").text();
    holder.mainMaterial =
        doc.select(
                bodyLeftPath
                    + " > div.materials > div > div.yl.zl.clearfix > ul > li > div > h4 > a")
            .text();

    // Preferred layout first; fall back to the plain paragraph layout when it is absent.
    Elements stepContainers =
        doc.select(measurePath + " > div.editnew.edit > div.content.clearfix");
    if (stepContainers.isEmpty()) {
      stepContainers = doc.select(measurePath + " > div.edit > p");
    }

    for (Element container : stepContainers) {
      if (!container.children().hasClass("step")) {
        continue;
      }
      String stepText = container.text();
      if (!stepText.isEmpty()) {
        holder.steps.add(stepText);
      }
    }

    return holder;
  }
Esempio n. 3
0
  /**
   * Looks up addresses by postal code (CEP).
   *
   * <p>POSTs the query to {@code Utils.adressCorreios} with Jsoup and builds one {@code Address}
   * per {@code tr} of the third {@code table} in the response. The result replaces and is also
   * stored in {@code listEnderecos}.
   *
   * @param cep postal code to search for
   * @return the addresses read from the response table
   * @throws IOException if the request fails
   */
  public List<Address> getByCep(String cep) throws IOException {
    listEnderecos = new ArrayList<Address>();

    // Form parameters sent with the request.
    Map<String, String> query = new HashMap<String, String>();
    query.put("CEP", cep);
    query.put("Metodo", "listaLogradouro");
    query.put("TipoConsulta", "cep");
    query.put("StartRow", "1");
    query.put("EndRow", "10");

    // The lookup must be a POST carrying the mapped parameters.
    Document doc =
        Jsoup.connect(Utils.adressCorreios)
            .data(query)
            .header("Origin", "http://www.buscacep.correios.com.br")
            .header("Referer", "http://www.buscacep.correios.com.br")
            .post();

    // Walk the rows of the third table; cell order is street, district, city, state, CEP.
    for (Element row : doc.select("table").eq(2).select("tr")) {
      Elements cells = row.children().select("td");

      Address address = new Address();
      address.setLogradouro(cells.eq(0).text());
      address.setBairro(cells.eq(1).text());
      address.setLocalidade(cells.eq(2).text() + "/" + cells.eq(3).text());
      address.setCEP(cells.eq(4).text());

      listEnderecos.add(address);
    }

    return listEnderecos;
  }
  /**
   * Resets the current page to 1, posts the query and reads the page count from the
   * {@code p[class=page_info]} text: the number between the last {@code '/'} and the first
   * {@code ']'}.
   *
   * @return the parsed page count
   * @throws NumberFormatException if that part of the text is not an integer
   */
  private int getPageNum() {
    setCurrentPage(1);
    Document doc = WebScrappUtil.post(url, argMap);
    String pageInfo = doc.select("p[class=page_info]").text();

    String[] parts = pageInfo.split("]");
    logger.info("aa:{}", pageInfo);

    String head = parts[0];
    String pageCount = head.substring(head.lastIndexOf("/") + 1);
    return Integer.parseInt(pageCount);
  }
Esempio n. 5
0
  /**
   * Fetches {@code currentUrl} and builds a {@code Group} from the text of the elements with
   * ids {@code authors}, {@code class} and {@code group}.
   *
   * @return the populated group, or {@code null} when the raw response of {@code currentUrl}
   *     contains no lines (the behavior of the previous implementation)
   * @throws Exception if the URL is malformed or either fetch fails
   */
  @Override
  public Group call() throws Exception {
    URL url = new URL(currentUrl);

    // The raw stream is only used to tell whether the response has any content. Opening it in
    // try-with-resources guarantees it is closed even if the Jsoup fetch below throws.
    try (LineNumberReader in = new LineNumberReader(new InputStreamReader(url.openStream()))) {
      // Using Jsoup to scrape the data from the URL.
      Document doc = Jsoup.connect(currentUrl).get();
      String authors = doc.select("#authors").text();
      String myClass = doc.select("#class").text();
      String group = doc.select("#group").text();

      // Every line used to yield an identical Group, so checking for a first line is enough.
      if (in.readLine() == null) {
        return null;
      }

      Group myGroup = new Group();
      myGroup.setAuthors(authors);
      myGroup.setMyClass(myClass);
      myGroup.setGroup(group);
      return myGroup;
    }
  }
Esempio n. 6
0
 /**
  * Applies the replacement pairs carried by {@code query} to the text of {@code elements}.
  *
  * <p>The value extracted for {@code REPLACE_TAG} is split on {@code "##"} into
  * {@code from=to} pairs; a pair without a right-hand side replaces with the empty string.
  * Place-holders in both sides are expanded before the replacement is applied.
  *
  * @param query selector part containing the replace directive
  * @return the element text after all replacements
  * @throws IllegalArgumentException if a pair fails the {@code isUniqueValue} check for
  *     {@code "="} or starts with {@code "="}
  */
 private String parseReplace(String query) {
   String[] pairs = getValue(query, REPLACE_TAG).split("##");
   outputNodeInfo();
   String result = elements.text();

   for (String pair : pairs) {
     boolean malformed = !isUniqueValue(pair, "=") || pair.indexOf("=") == 0;
     if (malformed) {
       throw new IllegalArgumentException("Argument selector part: " + query + " is illegal");
     }

     String[] sides = pair.split("=");
     String target = replaceSpacePlaceHolder(replaceEqualPlaceHolder(sides[0]));
     String replacement =
         sides.length == 1 ? "" : replaceSpacePlaceHolder(replaceEqualPlaceHolder(sides[1]));

     result = result.replace(target, replacement);
   }
   return result;
 }
  /**
   * Builds an {@code Article} from the HTML carried by {@code htmlObject}.
   *
   * <p>Title comes from {@code .article h1}, content from {@code .article_con}, and the publish
   * date from the first {@code datePattern} match in {@code .article h2}; when no date is
   * found, today's date formatted as {@code yyyy-MM-dd} is used.
   *
   * @param htmlObject source of the HTML and of the article URL
   * @return the populated article
   */
  @Override
  public Article run(HtmlObject htmlObject) {
    Document doc = Jsoup.parse(htmlObject.getHtml());
    String title = doc.select(".article h1").text();
    // select() returns an empty Elements when nothing matches, so text() is simply "".
    String content = doc.select(".article_con").text();

    Matcher dateMatcher = datePattern.matcher(doc.select(".article h2").text());
    String date;
    if (dateMatcher.find()) {
      date = dateMatcher.group(1);
    } else {
      // Fall back to the current date when the page does not carry one.
      date = new SimpleDateFormat("yyyy-MM-dd").format(new Date());
    }

    Article article = new Article();
    article.setUrl(htmlObject.getUrl());
    article.setTitle(title);
    article.setContent(content);
    article.setPublishDate(date);
    article.setArticleType(ArticleType.News);
    article.setProvider("雨果网");
    return article;
  }
  /**
   * Extracts the number that follows the "成员" label in the link text of the
   * {@code div#friend} block.
   *
   * @param doc parsed profile page
   * @return the digits of the first match, or {@code null} when
   *     {@code UtilsMethod.findFirstStringByRegex} returns {@code null}
   */
  private String getFellowPeopleNum(Document doc) {
    // select() never returns null; an unmatched selector just yields empty text.
    String linkText = doc.select("div[id=\"friend\"]").select("a").text();

    String match = UtilsMethod.findFirstStringByRegex("成员[0-9]+", linkText);
    if (match == null) {
      return null;
    }
    // Strip the label, keeping only the digits.
    return match.replaceAll("[\\D]+", "");
  }
Esempio n. 9
0
 /**
  * Stores {@code content} and, when the parsed markup has a {@code <title>} with text, also
  * updates {@code title} from it.
  *
  * @param content markup to store and inspect
  */
 public void setContent(String content) {
   this.content = content;
   Elements titleTags = Jsoup.parse(content).select("title");
   if (!titleTags.hasText()) {
     return;
   }
   this.title = titleTags.text();
 }
    /**
     * Copies the discussion link(s) and question title(s) found in {@code doc} into
     * {@code dailyNews}.
     *
     * <p>With more than one {@code view-more} element the news is marked as multi and one
     * title/URL pair is added per element; with exactly one it is marked as single and the
     * title/URL are set. An empty question title is replaced by {@code dailyTitle}.
     *
     * @param doc parsed page
     * @param dailyTitle fallback title
     * @param dailyNews object to update; it may already be partially updated when
     *     {@code false} is returned
     * @return {@code false} when there is no {@code view-more} element or a link's text is not
     *     the expected discussion label, {@code true} otherwise
     */
    private boolean updateDailyNews(Document doc, String dailyTitle, DailyNews dailyNews)
        throws JSONException {
      final String discussionLabel = "查看知乎讨论";

      Elements viewMoreElements = doc.getElementsByClass("view-more");
      int linkCount = viewMoreElements.size();

      if (linkCount < 1) {
        return false;
      }

      if (linkCount == 1) {
        dailyNews.setMulti(false);

        Elements anchor = viewMoreElements.select("a");
        if (!anchor.text().equals(discussionLabel)) {
          return false;
        }
        dailyNews.setQuestionUrl(anchor.attr("href"));

        // An empty question title means it is the same as the daily title.
        String questionTitle = doc.getElementsByClass("question-title").text();
        dailyNews.setQuestionTitle(questionTitle.length() == 0 ? dailyTitle : questionTitle);
        return true;
      }

      dailyNews.setMulti(true);
      Elements questionTitleElements = doc.getElementsByClass("question-title");

      for (int j = 0; j < linkCount; j++) {
        String questionTitle = questionTitleElements.get(j).text();
        dailyNews.addQuestionTitle(questionTitle.length() == 0 ? dailyTitle : questionTitle);

        Elements anchor = viewMoreElements.get(j).select("a");
        if (!anchor.text().equals(discussionLabel)) {
          return false;
        }
        dailyNews.addQuestionUrl(anchor.attr("href"));
      }

      return true;
    }
 /**
  * Reads the person's name from the page, taken from the text of the {@code <title>} element.
  *
  * @param doc parsed profile page
  * @return the title text, or {@code null} if the selection itself is {@code null}
  */
 private String getName(Document doc) {
   Elements titleTags = doc.select("title");
   return titleTags == null ? null : titleTags.text();
 }
 /**
  * Reads the join date from the {@code div.pl} block inside {@code div.user-info}.
  *
  * @param doc parsed profile page
  * @return the first {@code yyyy-MM-dd} shaped substring as found by
  *     {@code UtilsMethod.findFirstStringByRegex}, or {@code null} if the selection is
  *     {@code null}
  */
 private String getJoinDate(Document doc) {
   Elements joinDateBlock = doc.select("div[class=\"user-info\"] div[class=\"pl\"]");
   if (joinDateBlock == null) {
     return null;
   }
   String blockText = joinDateBlock.text();
   return UtilsMethod.findFirstStringByRegex("[0-9]{4}\\-[0-9]{2}\\-[0-9]{2}", blockText);
 }
  /**
   * Downloads {@code newsURL} and extracts the news title and body.
   *
   * <p>The title is the {@code <title>} text cut at the first {@code '|'}; the body is the text
   * of the {@code p} elements under {@code div[class=content]} after links, images, scripts and
   * form controls have been removed.
   *
   * @param newsURL address of the news page
   * @return the parsed entity, or {@code null} when the page could not be fetched
   */
  @Override
  public NewsEntity parseNewsPerCategory(String newsURL) {
    NewsEntity parsedData = null;

    try {
      Document doc = Jsoup.connect(newsURL).timeout(Constants.MAX_DELAY_TIME * 1000).get();
      doc.outputSettings().charset(Charset.forName("UTF-8"));
      doc.normalise();

      String titleName = doc.select("title").text();
      int separatorIndex = titleName.indexOf("|");
      if (separatorIndex >= 0) {
        titleName = titleName.substring(0, separatorIndex).trim();
      }

      // select() never returns null, so no null check is needed before using the result.
      Elements newsElements = doc.select("div[class=content]").select("p");
      newsElements.select("a, img, script, xml, input, label, textarea").remove();

      try {
        parsedData = new NewsEntity();
        parsedData.setNewsTitle(titleName);
        parsedData.setNewsBody(newsElements.text());
        parsedData.setNewsURL(newsURL);
      } catch (Exception e) {
        // Best effort, as before: a failing setter leaves a partially filled entity.
        e.printStackTrace();
      }
    } catch (IOException e) {
      e.printStackTrace();
    }

    return parsedData;
  }
Esempio n. 14
0
  /**
   * Fetches one product page twice in a row with a browser-like user agent and prints the page
   * title after each fetch.
   *
   * @param args unused
   */
  public static void main(String[] args) {

    // Other pages that were used with this driver:
    // http://www.cdiscount.com/electromenager/tous-nos-accessoires/joint-hublot-d-30-30-cm/f-11029-ind3662734065501.html#mpos=2|mp
    // http://www.cdiscount.com/le-sport/vetements-de-sport/kappa-survetement-armor-homme/f-121020526-3025ej0005.html#mpos=1|cd
    // http://www.cdiscount.com/animalerie/chiens/lot-de-3-sofas-pour-chien/f-1621004-ifd19945rouge.html
    // http://www.cdiscount.com/telephonie/r-housse+guidon.html#_his_
    final String targetUrl =
        "http://www.cdiscount.com/maison/tapis/rio-tapis-shaggy-anthracite-30-mm-160x230-cm/f-1172512-r252an160230.html";
    final String browserAgent =
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB;     rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13 (.NET CLR 3.5.30729)";

    try {
      // Two identical requests; timeout(0) disables the Jsoup timeout.
      for (int attempt = 0; attempt < 2; attempt++) {
        org.jsoup.nodes.Document doc =
            Jsoup.connect(targetUrl)
                .userAgent(browserAgent)
                .referrer("accounterlive.com")
                .ignoreHttpErrors(true)
                .timeout(0)
                .get();

        System.out.println(doc.select("title").text());
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
Esempio n. 15
0
    /**
     * Downloads {@code link} and fills {@code title}, {@code img_src}, {@code description_text}
     * and {@code DateAdded} from the page. Any exception is printed and otherwise ignored.
     *
     * @param params unused
     * @return always {@code null}
     */
    @Override
    protected Void doInBackground(Void... params) {

      try {
        Document page = Jsoup.connect(link).ignoreContentType(true).get();

        title = page.select("span.title2").text();
        img_src = page.select("img.news-record-thumbnail").attr("src");

        // Each paragraph is appended followed by a blank line.
        for (Element paragraph : page.select("div.news-block-justify").select("p")) {
          description_text += paragraph.text() + "\n\n";
        }

        DateAdded = page.select("span.title").text();
      } catch (Exception ex) {
        ex.printStackTrace();
      }
      return null;
    }
Esempio n. 16
0
 /**
  * Evaluates {@code resultQuery} and returns the outcome as a list of strings.
  *
  * <ul>
  *   <li>{@code Elements} with several entries: the combined text split on
  *       {@code DEFAULT_SEPARATOR};
  *   <li>{@code Elements} with one entry and a non-empty {@code separator}: that entry's text
  *       split on {@code separator};
  *   <li>any other {@code Elements}: a list holding the combined text;
  *   <li>{@code String}: the string split on {@code separator};
  *   <li>{@code List}: returned as is (unchecked cast).
  * </ul>
  *
  * @param separator regular expression used to split single values
  * @return the list, or {@code null} for any other result type
  */
 @SuppressWarnings("unchecked")
 public List<String> selectListResult(String separator) {
   Object result = getResult(resultQuery);

   // Elements must be tested before List: it is itself a List implementation.
   if (result instanceof Elements) {
     Elements matched = (Elements) result;
     int count = matched.size();
     if (count > 1) {
       return Arrays.asList(matched.text().split(DEFAULT_SEPARATOR));
     }
     if (count == 1 && !StringUtils.isEmpty(separator)) {
       return Arrays.asList(matched.first().text().split(separator));
     }
     return ListKit.of(matched.text());
   }

   if (result instanceof String) {
     return Arrays.asList(((String) result).split(separator));
   }

   if (result instanceof List) {
     return (List<String>) result;
   }

   return null;
 }
Esempio n. 17
0
  /**
   * Reads the product brand from the link inside the {@code h1.component} heading of the
   * product detail column.
   *
   * @param doc parsed product page
   * @return a single-element list holding the lower-cased, trimmed brand text (an empty string
   *     when the link is missing)
   */
  @Override
  protected List<String> getProductBrandFromHTML(Document doc) throws Exception {
    Elements brand =
        doc.select("div.main-product")
            .select("div.detail-column")
            .select("h1.component")
            .select("a");

    // Locale.ROOT keeps the result independent of the JVM default locale (e.g. the Turkish
    // dotless-i mapping would otherwise change brand names containing 'I').
    String brandOfProduct = brand.text().toLowerCase(java.util.Locale.ROOT).trim();

    List<String> list = new ArrayList<String>();
    list.add(brandOfProduct);
    return list;
  }
Esempio n. 18
0
 /**
  * Returns the source text held in the {@code <pre>} elements of {@code document} with the
  * line-number anchors and the comment / Javadoc-comment spans removed.
  *
  * <p>The removal mutates {@code document}.
  *
  * @return the cleaned text of the {@code <pre>} elements
  */
 public String leituraJxr() throws IOException {
   Elements codeBlocks = document.getElementsByTag("pre");

   // Drop the line-number anchors first.
   codeBlocks.select("a.jxr_linenumber").remove();

   // Then strip ordinary comments and Javadoc comments from every block.
   for (Element block : codeBlocks) {
     block.getElementsByClass("jxr_comment").remove();
     block.getElementsByClass("jxr_javadoccomment").remove();
   }

   return codeBlocks.text();
 }
Esempio n. 19
0
  /**
   * Looks up postal codes (CEP) by street address.
   *
   * <p>POSTs the query to {@code Utils.adressCorreios} with a 20 second timeout and collects the
   * fifth cell of every {@code tr} of the third {@code table} in the response. The result
   * replaces and is also stored in {@code listAddress}.
   *
   * @param address street address to search for
   * @return the postal codes read from the response table
   * @throws IOException if the request fails
   */
  public List<String> getByAdress(String address) throws IOException {
    listAddress = new ArrayList<String>();

    // Form parameters sent with the request.
    Map<String, String> query = new HashMap<String, String>();
    query.put("relaxation", address);
    query.put("TipoCep", "ALL");
    query.put("semelhante", "N");
    query.put("cfm", "1");
    query.put("Metodo", "listaLogradouro");
    query.put("TipoConsulta", "relaxation");
    query.put("StartRow", "1");
    query.put("EndRow", "10");

    // The lookup must be a POST carrying the mapped parameters.
    Document doc =
        Jsoup.connect(Utils.adressCorreios)
            .timeout(20000)
            .data(query)
            .header("Origin", "http://www.buscacep.correios.com.br")
            .header("Referer", "http://www.buscacep.correios.com.br")
            .post();

    // Walk the rows of the third table and keep the CEP column of each one.
    for (Element row : doc.select("table").eq(2).select("tr")) {
      Address rowAddress = new Address();
      rowAddress.setCEP(row.children().select("td").eq(4).text());
      // Read the value back through the Address accessor, as the original did.
      listAddress.add(rowAddress.getCEP());
    }

    return listAddress;
  }
Esempio n. 20
0
  /**
   * Fetches one article page and prints its title and the text of {@code #article_content}.
   *
   * @param args unused
   * @throws Exception if the page cannot be fetched or has no {@code <title>} element
   */
  public static void main(String[] args) throws Exception {
    // Step 1: fetch the page.
    Document document = Jsoup.connect("http://www.huxiu.com/article/102062/1.html").get();

    // Step 2: parse the page.
    String title = document.getElementsByTag("title").get(0).text();
    String content = document.select("div #article_content").text();

    // Step 3: print the result.
    System.out.println("title:" + title);
    System.out.println("content:" + content);
  }
Esempio n. 21
0
  /**
   * Downloads {@code url}, stores the {@code topic_content} text in {@code title} and adds one
   * {@code DetailEntity} to {@code entities} for every {@code tbody} that has non-empty
   * {@code reply_content} text.
   *
   * @param url page to download
   */
  private void getDatafromJsoup(String url) {
    try {
      Document doc = Jsoup.connect(url).get();

      String topicText = doc.getElementsByClass("topic_content").text();
      Log.e("topic_content", topicText);
      title = topicText;

      for (Element row : doc.getElementsByTag("tbody")) {
        DetailEntity entity = new DetailEntity();

        // Only protocol-relative CDN avatars are kept; they get an explicit scheme.
        String avatarSrc = row.getElementsByTag("img").attr("src");
        if (avatarSrc.startsWith("//cdn.")) {
          entity.setAvater("http:" + avatarSrc);
        }

        String replyText = row.getElementsByClass("reply_content").text();
        Log.e("reply_content", replyText);
        entity.setReply_count(replyText);

        // The anchors are used as the entity title when the first one links to a member page.
        Elements anchors = row.getElementsByTag("a");
        if (anchors.attr("href").startsWith("/member/")) {
          String anchorText = anchors.text();
          Log.e("title", anchorText);
          entity.setTitle(anchorText);
        }

        Log.e(
            "other",
            row.getElementsByClass("fade small").text()
                + row.getElementsByClass("small fade").text());

        if (!TextUtils.isEmpty(replyText)) {
          entities.add(entity);
        }
      }

    } catch (IOException e) {
      e.printStackTrace();
    }
  }
 /**
  * Extracts the follower count from the text of the {@code p.rev-link} element, i.e. the
  * number inside the first "被N人关注" phrase.
  *
  * @param doc parsed profile page
  * @return the digits of the first match, or {@code null} when
  *     {@code UtilsMethod.findFirstStringByRegex} returns {@code null}
  */
 private String getFellowedPeopleNum(Document doc) {
   // select() never returns null; an unmatched selector just yields empty text.
   String linkText = doc.select("p[class=\"rev-link\"]").text();

   String match = UtilsMethod.findFirstStringByRegex("被[0-9]+人关注", linkText);
   if (match == null) {
     return null;
   }
   // Strip the surrounding words, keeping only the digits.
   return match.replaceAll("[\\D]+", "");
 }
Esempio n. 23
0
 /**
  * Evaluates {@code resultQuery} and converts the outcome to {@code clazz}.
  *
  * <p>The result is first reduced to a string: the combined text for {@code Elements}, the
  * value itself for a {@code String}, otherwise the empty string. A {@code String} target gets
  * that string directly; an {@code Integer} target has it passed through
  * {@code NumberKit.removeNotNumber} first; every non-{@code String} target is then
  * deserialized from the string with Gson.
  *
  * @param clazz target type, must not be {@code null}
  * @return the converted value
  * @throws IllegalArgumentException if {@code clazz} is {@code null}
  */
 @SuppressWarnings("unchecked")
 public <T> T selectResult(Class<T> clazz) {
   if (clazz == null) {
     throw new IllegalArgumentException("Argument clazz cannot be null");
   }

   Object result = getResult(resultQuery);
   String text;
   if (result instanceof Elements) {
     text = ((Elements) result).text();
   } else if (result instanceof String) {
     text = (String) result;
   } else {
     text = "";
   }

   if (String.class.equals(clazz)) {
     return (T) text;
   }
   if (Integer.class.equals(clazz)) {
     text = NumberKit.removeNotNumber(text);
   }
   return new Gson().fromJson(text, clazz);
 }
 /**
  * Returns the "want to listen" music list.
  *
  * <p>Finds the first element whose {@code div.substatus} text equals {@code 想听} and collects
  * the {@code title} attribute of every link under its {@code li.aob} items.
  *
  * @param obssinHtml candidate sections, may be {@code null}
  * @return the collected titles, or {@code null} when the input is {@code null} or no section
  *     matches
  */
 private List<String> getWantMusic(Elements obssinHtml) {
   if (obssinHtml == null) {
     return null;
   }
   for (Element section : obssinHtml) {
     String status = section.select("div[class=\"substatus\"]").text();
     if (!status.equals("想听")) {
       continue;
     }
     Elements links = section.select("li[class=\"aob\"] a");
     if (links == null) {
       return null;
     }
     List<String> titles = new ArrayList<String>();
     for (Element link : links) {
       titles.add(link.attr("title"));
     }
     return titles;
   }
   return null;
 }
 /**
  * Returns the list of watched movies.
  *
  * <p>Finds the first element whose {@code div.substatus} text equals {@code 看过} and collects
  * the {@code title} attribute of every link under its {@code li.aob} items.
  *
  * @param obssinHtml candidate sections, may be {@code null}
  * @return the collected titles, or {@code null} when the input is {@code null} or no section
  *     matches
  */
 private List<String> getWatchedMovie(Elements obssinHtml) {
   if (obssinHtml == null) {
     return null;
   }
   for (Element section : obssinHtml) {
     String status = section.select("div[class=\"substatus\"]").text();
     if (!status.equals("看过")) {
       continue;
     }
     Elements links = section.select("li[class=\"aob\"] a");
     if (links == null) {
       return null;
     }
     List<String> titles = new ArrayList<String>();
     for (Element link : links) {
       titles.add(link.attr("title"));
     }
     return titles;
   }
   return null;
 }
 /**
  * Returns the "want to read" book list.
  *
  * <p>Finds the first element whose {@code div.substatus} text equals {@code 想读} and collects
  * the {@code alt} attribute of every image inside the links of its {@code li.aob} items.
  *
  * @param obssinHtml candidate sections, may be {@code null}
  * @return the collected names, or {@code null} when the input is {@code null} or no section
  *     matches
  */
 private List<String> getWantBook(Elements obssinHtml) {
   if (obssinHtml == null) {
     return null;
   }
   for (Element section : obssinHtml) {
     String status = section.select("div[class=\"substatus\"]").text();
     if (!status.equals("想读")) {
       continue;
     }
     Elements covers = section.select("li[class=\"aob\"] a img");
     if (covers == null) {
       return null;
     }
     List<String> names = new ArrayList<String>();
     for (Element cover : covers) {
       names.add(cover.attr("alt"));
     }
     return names;
   }
   return null;
 }
Esempio n. 27
0
  /**
   * Checks whether the returned page reports an error.
   *
   * <p>Checks run in this order: a known "resource unavailable" message inside
   * {@code div.alert-error}; any other {@code div.alert-error}; and finally an {@code <html>}
   * element whose {@code xmlns:wb} attribute is not the expected Weibo namespace.
   *
   * @param html response body to inspect
   * @throws ResourceNotFountException if the alert text contains one of the known
   *     resource-error messages
   * @throws RespUrlException if another alert is present or the namespace attribute does not
   *     match
   * @throws IOException declared type; presumably the supertype of both exceptions above
   */
  private void checkRespHaveAlertError(String html) throws IOException {
    Document doc = Jsoup.parse(html);

    Elements htmlEle = doc.select("html");
    String key = htmlEle.isEmpty() ? null : htmlEle.first().attr("xmlns:wb");

    Elements alerts = doc.select("div.alert-error");
    String errMsg = alerts.text();

    // Both messages indicate that the linked resource is broken.
    boolean resourceError =
        StringUtils.indexOf(errMsg, "影片暂时不可以访问") != -1
            || StringUtils.indexOf(errMsg, "遇到一个错误了") != -1;
    if (resourceError) {
      throw new ResourceNotFountException(errMsg);
    }

    if (!alerts.isEmpty()) {
      throw new RespUrlException("请求页面结果错误");
    }
    if (!"http://open.weibo.com/wb".equals(key)) {
      throw new RespUrlException("代理返回结果错误");
    }
  }
Esempio n. 28
0
  /**
   * Parses the detail page of every "same as" URL of an existing entity and copies the working
   * hours, price range, payment information, services and homepage into it.
   *
   * <p>Processing stops at the first URL whose page cannot be loaded or fails
   * {@code validateSite}.
   *
   * @param pEntity entity to enrich; modified in place
   */
  public void parseIndividualEnt(PersistentEntity pEntity) {
    // Iterate over a snapshot of the keys: putToSameAs() below presumably writes to the same
    // map, which would invalidate a live iterator (TODO confirm against PersistentEntity).
    java.util.List<Utf8> sameAsKeys =
        new java.util.ArrayList<Utf8>(pEntity.getSameAs().keySet());

    for (Utf8 sameAsKey : sameAsKeys) {
      // Reading individual URLs
      String entityUrl = ParserUtils.getUri(sameAsKey.toString()).toASCIIString();
      LOGGER.info("Parsing entity from: " + entityUrl);
      Document doc = ParserUtils.connectGetUrl(entityUrl);

      // '||' (not '&&'): a null document must never reach validateSite or the parsing below,
      // and a non-null but invalid document must not be parsed either.
      if (doc == null || !validateSite(doc)) {
        break;
      }

      doc.setBaseUri(VejaSaoPauloParser.DEFAULT_VSP_URL);
      Elements infoBlock = doc.select("div[class*=information-unwanted]");

      // getting working hours
      Elements workElems = infoBlock.select("div[class*=working-hours]");
      if (!workElems.isEmpty()) {
        StringBuilder schedule = new StringBuilder();
        for (Element info : workElems.select("div[class*=hours]").select("p")) {
          schedule.append(info.text().replace("-", "_")).append(ParserProperties.INFO_SEP);
        }
        pEntity.setSchedule(new Utf8(schedule.toString()));
      }

      // getting price range
      workElems = infoBlock.select("div[class*=price]").select("p[class*=price-range]");
      if (!workElems.isEmpty()) {
        StringBuilder price = new StringBuilder();
        price.append(
            doc.select("div[class*=price]").select("h3").first().text()
                + ParserProperties.DESC_SEP);
        price.append(workElems.text());
        pEntity.addToExtraInfo(new Utf8(price.toString()));
      }

      // getting payment information
      workElems = infoBlock.select("div[class*=payment]").select("p");
      if (!workElems.isEmpty()) {
        StringBuilder payment = new StringBuilder();
        payment.append(
            doc.select("div[class*=payment]").select("h3").first().text()
                + ParserProperties.DESC_SEP);
        for (Element infoElem : workElems) {
          String paymentText = infoElem.text().trim();
          if (!paymentText.equals("")) {
            payment.append(paymentText + ParserProperties.INFO_SEP);
          }
        }
        // Previously this text was built and then discarded; store it like the price range.
        pEntity.addToExtraInfo(new Utf8(payment.toString()));
      } // END-IF_PAYMENT

      // getting services provided information
      workElems =
          infoBlock
              .select("div[class*=services]")
              .select("div[class*=information-services]")
              .select("p");
      for (Element infoElem : workElems) {
        if (infoElem.hasClass("observation")) {
          pEntity.addToExtraInfo(new Utf8("Observation :" + infoElem.text()));
        } else if (!infoElem.text().equals("")) {
          pEntity.addToServices(new Utf8(infoElem.text()));
        }
      } // END-FOR_SERVICES

      // getting home url
      workElems = infoBlock.select("div[class*=website]");
      if (!workElems.isEmpty()) {
        EylloLink homeLink =
            ParserUtils.detectUrl(
                workElems
                    .select("div[class*=information-website]")
                    .select("p")
                    .select("a")
                    .first());
        if (homeLink != null) {
          pEntity.setHomepage(new Utf8(homeLink.getLinkHref()));
          pEntity.putToSameAs(new Utf8(homeLink.getLinkHref()), new Utf8(homeLink.getLinkText()));
        }
      } // END-IF_URL

      pEntity.setDescription(new Utf8(""));
    } // END-FOR
  }
Esempio n. 29
0
  /**
   * Builds the post form, submits it synchronously through {@code VolleyHelper} and stores the
   * outcome in {@code mResult} / {@code mStatus} (plus {@code mTid} on success and {@code mTitle}
   * whenever a subject is sent).
   *
   * @param url target the form is posted to
   * @param replyText sent as the {@code message} form field
   * @param subject sent as {@code subject} in new-thread mode, or in edit mode when non-empty
   * @param typeid sent as {@code typeid} only in edit mode when both it and the subject are
   *     non-empty
   */
  private void doPost(String url, String replyText, String subject, String typeid) {
    final String formhash = (mInfo == null) ? null : mInfo.getFormhash();

    // Without a form hash nothing is sent at all.
    if (TextUtils.isEmpty(formhash)) {
      mResult = "发表失败,无法获取必要信息 !";
      mStatus = Constants.STATUS_FAIL;
      return;
    }

    // ---- assemble the form fields ----
    final Map<String, String> form = new HashMap<>();
    form.put("formhash", formhash);
    form.put("posttime", String.valueOf(System.currentTimeMillis()));
    form.put("wysiwyg", "0");
    form.put("checkbox", "0");
    form.put("message", replyText);

    for (String added : mInfo.getAttaches()) {
      form.put("attachnew[" + added + "][description]", added);
    }
    // Explicitly deleted attachments and unused images share the same "attachdel" key shape.
    for (String removed : mInfo.getAttachdel()) {
      form.put("attachdel[" + removed + "]", removed);
    }
    for (String unused : mInfo.getUnusedImages()) {
      form.put("attachdel[" + unused + "]", unused);
    }

    // A subject is always sent for a new thread; when editing, only if one was supplied.
    final boolean creatingThread = mMode == MODE_NEW_THREAD;
    if (creatingThread || (mMode == MODE_EDIT_POST && !TextUtils.isEmpty(subject))) {
      form.put("subject", subject);
      if (creatingThread) {
        form.put("attention_add", "1");
      }
      mTitle = subject;
      if (!creatingThread && !TextUtils.isEmpty(typeid)) {
        form.put("typeid", typeid);
      }
    }

    // ---- send ----
    SimpleErrorListener failureListener = VolleyHelper.getInstance().getErrorListener();
    String response = VolleyHelper.getInstance().synchronousPost(url, form, failureListener);

    // ---- no response body: report the listener's error ----
    if (TextUtils.isEmpty(response)) {
      Logger.e(failureListener.getError());

      mResult = "发表失败,无返回结果! " + failureListener.getErrorText();
      mStatus = Constants.STATUS_FAIL;

      if (HiSettingsHelper.getInstance().isErrorReportMode()) {
        ACRAUtils.acraReport(failureListener.getError(), "no response");
      }
      return;
    }

    // Original author's note: on success volley follows the 302 redirect, so the response is the
    // resulting page content; the thread id is extracted from its inline script.
    final String tid =
        response.contains("tid = parseInt('")
            ? HttpUtils.getMiddleString(response, "tid = parseInt('", "'")
            : "";

    final boolean posted =
        !TextUtils.isEmpty(tid)
            && TextUtils.isDigitsOnly(tid)
            && Integer.parseInt(tid) > 0
            && !response.contains("alert_info");
    if (posted) {
      mTid = tid;
      mResult = "发表成功!";
      mStatus = Constants.STATUS_SUCCESS;
      return;
    }

    // ---- response received but it does not look like a successful post ----
    Logger.e(response);
    mResult = "发表失败! ";
    mStatus = Constants.STATUS_FAIL;

    Elements alerts = Jsoup.parse(response).select("div.alert_info");
    if (alerts == null || alerts.size() <= 0) {
      // No alert box to show the user; optionally report the raw response instead.
      if (HiSettingsHelper.getInstance().isErrorReportMode()) {
        ACRAUtils.acraReport("Error when posting but with response", response);
      }
    } else {
      mResult += alerts.text();
    }
  }
  /**
   * Pulls the raw string values of a stock page out of {@code doc} and stores them in this
   * object's {@code *Str} fields (code, sector, name, prices, market and the captioned detail
   * values). No numeric conversion is kept here: values are stored as text.
   *
   * @param doc parsed page to read from
   * @throws FailedToFindElementException if the stock info element or the stock table element
   *     cannot be found in {@code doc}
   */
  private void extractDataAsString(Document doc) throws FailedToFindElementException {
    // --- header info: stock code and sector ---
    Elements infoElements = doc.select(CSS_QUERY_TO_FIND_STOCKS_INFO);
    if (infoElements == null || infoElements.size() < 1) {
      throw new FailedToFindElementException("Cannot find stock info element.");
    }
    // Only the first matching info element is used.
    Element info = infoElements.get(0);
    if (info != null) {
      Elements dts = info.select("dt");
      if (dts != null) {
        stockCodeStr = dts.text().trim();
      }
      Elements category = info.select(".category");
      if (category != null) {
        sectorStr = category.text().trim();
      }
    }

    // --- main table: stock name, real-time price and change vs. previous day ---
    Elements stocksTables = doc.select(CSS_QUERY_TO_FIND_STOCKS_TABLE);
    if (stocksTables == null || stocksTables.size() < 1) {
      throw new FailedToFindElementException("Cannot find stock table element.");
    }
    // Only the first matching table is used.
    Element stocksTable = stocksTables.get(0);

    Elements symbol = stocksTable.select(".symbol");
    if (symbol != null) {
      stockNameStr = symbol.text().trim();
    }

    Elements tds = stocksTable.select("td");
    for (Element td : tds) {
      String text = Util.normalizeRoundParentheses(td.text().trim());
      if (text.length() == 0) {
        // Empty cell: nothing to record.
      } else if (td.classNames().contains("change")) {
        priceComparisonWithPreviousDayStr = text;
      } else {
        // Parse only to test whether the cell is numeric; the parsed value is discarded and the
        // original text is what gets stored.
        try {
          Double.parseDouble(Util.removeCommaAndNbsp(text));
        } catch (NumberFormatException e) {
          continue;
        }
        // The loop does not stop here, so the last numeric cell wins.
        realtimePriceStr = text;
      }
    }

    // --- market name ---
    Elements spans = doc.select(CSS_QUERY_IN_DETAIL_PAGE_TO_FIND_SPAN_UNDER_ID_DEAL);

    // Market labels this pass is meant to recognise (translated from the original note):
    // Mothers, Sapporo SE, Sapporo Ambitious, TSE, TSE 1st Section, TSE 2nd Section,
    // TSE JASDAQ Growth, TSE JASDAQ Standard, TSE Foreign, Fukuoka Q-Board, Fukuoka SE.
    // They are detected by the single characters tested below.
    for (Element span : spans) {
      String s = span.text().trim();
      if (s.length() > 0
          && (s.indexOf("マ") >= 0
              || s.indexOf("札") >= 0
              || s.indexOf("東") >= 0
              || s.indexOf("福") >= 0)) {
        // Keep only the part before the first '(' if there is one.
        int index = s.indexOf('(');
        if (index >= 0) {
          s = s.substring(0, index);
        }
        marketStr = s;
        break;
      }
    }
    // Fallback: take the first non-empty span that is not a parenthesised note, "PTS" or "OTC".
    // This only runs when marketStr is still null, so a value present before this call also
    // suppresses it.
    if (marketStr == null) {
      for (Element span : spans) {
        String s = span.text().trim();
        if (s.length() > 0 && !s.startsWith("(") && !s.equals("PTS") && !s.equals("OTC")) {
          int index = s.indexOf('(');
          if (index >= 0) {
            s = s.substring(0, index);
          }
          marketStr = s;
          break;
        }
      }
    }

    // --- captioned detail values (<dl> blocks: <dt> caption, <dd> value) ---
    // isDebt / isSelling remember which margin balance caption was seen most recently, so that a
    // following "ratio vs. previous week" caption can be assigned to the matching field.
    boolean isDebt = false;
    boolean isSelling = false;
    Elements dls = doc.select(CSS_QUERY_IN_DETAIL_PAGE_TO_FIND_ALL_DL);
    for (Element dl : dls) {
      Elements dt = dl.getElementsByTag("dt");
      Elements dd = dl.getElementsByTag("dd");
      String caption = dt.text().trim();
      String value = Util.normalizeRoundParentheses(dd.text().trim());

      // Prefix matching, first match wins: the order of the branches below may matter if one
      // caption constant is a prefix of another (constant values are not visible here).
      if (caption.startsWith(CAPTION_PREVIOUS_CLOSING_PRICE)) {
        previousClosingPriceStr = value;
      } else if (caption.startsWith(CAPTION_OPENING_PRICE)) {
        openingPriceStr = value;
      } else if (caption.startsWith(CAPTION_HIGH_PRICE)) {
        highPriceStr = value;
      } else if (caption.startsWith(CAPTION_LOW_PRICE)) {
        lowPriceStr = value;
      } else if (caption.startsWith(CAPTION_TRADING_VOLUME_OF_STOCKS)) {
        tradingVolumeOfStocksStr = value;
      } else if (caption.startsWith(CAPTION_TRADING_VALUE_OF_MONEY)) {
        tradingValueOfMoneyStr = value;
      } else if (caption.startsWith(CAPTION_PRICE_LIMIT)) {
        priceLimitStr = value;
      } else if (caption.startsWith(CAPTION_MARKET_CAPITALIZATION)) {
        marketCapitalizationStr = value;
      } else if (caption.startsWith(CAPTION_OUTSTANDING_STOCK_VOLUME)) {
        outstandingStockVolumeStr = value;
      } else if (caption.startsWith(CAPTION_ANNUAL_INTEREST_RATE)) {
        annualInterestRateStr = value;
      } else if (caption.startsWith(CAPTION_DIVIDENDS_PER_SHARE)) {
        dividendsPerShareStr = value;
      } else if (caption.startsWith(CAPTION_PER)) {
        perStr = value;
      } else if (caption.startsWith(CAPTION_PBR)) {
        pbrStr = value;
      } else if (caption.startsWith(CAPTION_EPS)) {
        epsStr = value;
      } else if (caption.startsWith(CAPTION_BPS)) {
        bpsStr = value;
      } else if (caption.startsWith(CAPTION_MINIMUM_PURCHASE_AMOUNT)) {
        minimumPurchaseAmountStr = value;
      } else if (caption.startsWith(CAPTION_SHARE_UNIT_NUMBER)) {
        shareUnitNumberStr = value;
      } else if (caption.startsWith(CAPTION_YEARLY_HIGH)) {
        yearlyHighStr = value;
      } else if (caption.startsWith(CAPTION_YEARLY_LOW)) {
        yearlyLowStr = value;
      } else if (caption.startsWith(CAPTION_NET_ASSETS)) {
        netAssetsStr = value;
      } else if (caption.startsWith(CAPTION_UNIT_OF_TRADING)) {
        unitOfTradingStr = value;
      } else if (caption.startsWith(CAPTION_MANAGEMENT_COMPANY)) {
        managementCompanyStr = value;
      } else if (caption.startsWith(CAPTION_TYPE_OF_ASSETS_TO_BE_INVESTED)) {
        typeOfAssetsToBeInvestedStr = value;
      } else if (caption.startsWith(CAPTION_REGION_TO_BE_INVESTED)) {
        regionToBeInvestedStr = value;
      } else if (caption.startsWith(CAPTION_UNDERLYING_INDEX)) {
        underlyingIndexStr = value;
      } else if (caption.startsWith(CAPTION_SETTLEMENT_FREQUENCY)) {
        settlementFrequencyStr = value;
      } else if (caption.startsWith(CAPTION_SETTLEMENT_MONTH)) {
        settlementMonthStr = value;
      } else if (caption.startsWith(CAPTION_LISTED_DATE)) {
        listedDateStr = value;
      } else if (caption.startsWith(CAPTION_TRUST_FEE)) {
        trustFeeStr = value;
      } else if (caption.startsWith(CAPTION_MARGIN_DEBT_BALANCE)) {
        marginDebtBalanceStr = value;
        isDebt = true;
        isSelling = false;
      } else if (caption.startsWith(CAPTION_MARGIN_RATIO_COMPARISON_WITH_PREVIOUS_WEEK)) {
        // Same caption is used for both balances; route it by whichever balance came last.
        // If neither balance has been seen yet, the value is dropped.
        if (isDebt) {
          marginDebtBalanceRatioComparisonWithPreviousWeekStr = value;
        } else if (isSelling) {
          marginSellingBalanceRatioComparisonWithPreviousWeekStr = value;
        }
      } else if (caption.startsWith(CAPTION_MARGIN_SELLING_BALANCE)) {
        marginSellingBalanceStr = value;
        isDebt = false;
        isSelling = true;
      } else if (caption.startsWith(CAPTION_RATIO_OF_MARGIN_BALANCE)) {
        ratioOfMarginBalanceStr = value;
      } else {
        // Unrecognised caption: print it to stdout unless it is empty or matches one of the
        // caption prefixes / value fragments explicitly excluded below.
        if (!caption.equals("")
            && !caption.startsWith("値上がり率")
            && !caption.startsWith("値下がり率")
            && !caption.startsWith("[買い]")
            && !caption.startsWith("[売り]")
            && value.indexOf("リアルタイム株価") < 0) {

          //
          // TODO: unknown data format
          //
          System.out.println("unknown caption=" + caption);
          System.out.println("unknown value=" + value);
        }
      }
    }
  }