/**
 * Downloads the page at {@code pageURL} through an HTTP proxy and classifies the text of its
 * {@code <title>} and {@code <body>} elements.
 *
 * <p>The response is decoded as UTF-8 and line breaks are preserved, so words on adjacent source
 * lines are not glued together before the text is extracted.
 *
 * @param pageURL absolute URL of the page to fetch; also used as the base URI of the parsed
 *     document
 * @return the value returned by {@code classifyText}, or {@code "N/A"} when the page cannot be
 *     fetched or read
 */
public String classifyUrl(String pageURL) {
  try {
    URL url = new URL(pageURL);
    // NOTE(review): the proxy address and credentials are hard-coded in source; they should be
    // moved to configuration or secure storage.
    Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress("202.141.80.22", 3128));
    Authenticator authenticator =
        new Authenticator() {
          @Override
          public PasswordAuthentication getPasswordAuthentication() {
            return new PasswordAuthentication("b.revanth", "batman9903".toCharArray());
          }
        };
    // This replaces the process-wide default authenticator.
    Authenticator.setDefault(authenticator);

    URLConnection urlConnection = url.openConnection(proxy);
    urlConnection.connect();

    StringBuilder webPageBuffer = new StringBuilder();
    // try-with-resources closes the reader (and the underlying stream) on every path.
    try (BufferedReader inputReader =
        new BufferedReader(new InputStreamReader(urlConnection.getInputStream(), "UTF-8"))) {
      String line;
      while ((line = inputReader.readLine()) != null) {
        // Keep the line break: readLine() strips it.
        webPageBuffer.append(line).append('\n');
      }
    }

    // The second argument of Jsoup.parse(String, String) is the base URI, not a charset.
    Document document = Jsoup.parse(webPageBuffer.toString(), pageURL);
    Elements title = document.select("title");
    Elements body = document.select("body");
    Log.i("Now Classifying ", pageURL);
    return classifyText(title.text() + "\n" + body.text());
  } catch (IOException e) {
    Log.i("Error:", e.toString());
    e.printStackTrace();
  }
  return "N/A";
}
public Holder doParse(String html, String url) { Holder holder = new Holder(); holder.url = url; Document doc = Jsoup.parse(html, url); Elements typeElement = doc.select("body > div.main_w.clearfix > div.main.clearfix > ul > li:nth-child(5) > a"); holder.dishType = typeElement.text(); Elements titleElement = doc.select( "body > div.main_w.clearfix > div.main.clearfix > div.cp_header.clearfix > div.cp_main_info_w > div.info1 > h1 > a"); holder.title = titleElement.text(); Elements methodElement = doc.select( "body > div.main_w.clearfix > div.main.clearfix > div.cp_header.clearfix > div.cp_main_info_w > div.info2 > ul > li:nth-child(1) > a"); holder.method = methodElement.text(); Elements materialElement = doc.select( "body > div.main_w.clearfix > div.main.clearfix > div.cp_body.clearfix > div.cp_body_left > div.materials > div > div.yl.zl.clearfix > ul > li > div > h4 > a"); holder.mainMaterial = materialElement.text(); Elements stepE = doc.select( "body > div.main_w.clearfix > div.main.clearfix > div.cp_body.clearfix > div.cp_body_left > div.measure > div.editnew.edit > div.content.clearfix"); // // body > div.main_w.clearfix > div.main.clearfix > div.cp_body.clearfix // > div.cp_body_left > div.measure > div.editnew.edit > // div.content.clearfix // body > div.main_w.clearfix > div.main.clearfix > div.cp_body.clearfix // > div.cp_body_left > div.measure > div.edit > p:nth-child(1) > em // if (stepE.size() == 0) { stepE = doc.select( "body > div.main_w.clearfix > div.main.clearfix > div.cp_body.clearfix > div.cp_body_left > div.measure > div.edit > p"); } for (int i = 0; i < stepE.size(); i++) { Element e = stepE.get(i); if (e.children().hasClass("step")) { String step = e.text(); if (!"".equals(step)) { holder.steps.add(step); } } } // body > div.main_w.clearfix > div.main.clearfix > div.cp_body.clearfix // > div.cp_body_left > div.measure > div.editnew.edit > // div:nth-child(1) return holder; }
// Busca os endereços pelo número do CEP. public List<Address> getByCep(String cep) throws IOException { listEnderecos = new ArrayList<Address>(); // mapeamento dos parametros que será passado na requisição Map<String, String> query = new HashMap<String, String>(); query.put("CEP", cep); query.put("Metodo", "listaLogradouro"); query.put("TipoConsulta", "cep"); query.put("StartRow", "1"); query.put("EndRow", "10"); // Faz uma requisição no site do correios (www.buscacep.com.br) com Json, passando os parametros // mapeados, // requisição deverá ser do tipo post. // Armazena o retorno em uma variavel doc. Document doc = Jsoup.connect(Utils.adressCorreios) .data(query) .header("Origin", "http://www.buscacep.correios.com.br") .header("Referer", "http://www.buscacep.correios.com.br") .post(); // Acessa o retorno do doc e percorre o resultado buscando as informações dos endereços // Armazena os resultados na lista de endereços criadas e retorna a mesma para que outras // classes possam acessar. Elements elements = doc.select("table").eq(2); Elements rows = elements.select("tr"); Iterator<Element> rowIterator = rows.iterator(); while (rowIterator.hasNext()) { Address enderecos = new Address(); Element element = rowIterator.next(); Elements logradouro = element.children().select("td").eq(0); enderecos.setLogradouro(logradouro.text()); Elements bairro = element.children().select("td").eq(1); enderecos.setBairro(bairro.text()); Elements cidade = element.children().select("td").eq(2); Elements estado = element.children().select("td").eq(3); StringBuilder sbLocalidade = new StringBuilder(); sbLocalidade.append(cidade.text()); sbLocalidade.append("/"); sbLocalidade.append(estado.text()); enderecos.setLocalidade(sbLocalidade.toString()); Elements codigopostal = element.children().select("td").eq(4); enderecos.setCEP(codigopostal.text()); listEnderecos.add(enderecos); } return listEnderecos; }
/**
 * Resets the current page to 1, posts the query and reads a page number from the text of the
 * {@code p.page_info} element: the number following the last {@code '/'} in the part before
 * the first {@code ']'}.
 *
 * @return the parsed page number
 * @throws NumberFormatException if that text is not an integer
 */
private int getPageNum() {
  setCurrentPage(1);
  Document doc = WebScrappUtil.post(url, argMap);
  String pageInfo = doc.select("p[class=page_info]").text();
  String[] parts = pageInfo.split("]");
  logger.info("aa:{}", pageInfo);

  String head = parts[0];
  int slash = head.lastIndexOf("/");
  return Integer.parseInt(head.substring(slash + 1));
}
@Override public Group call() throws Exception { Group result = null; // Gets every URL you send in URL url = new URL(currentUrl); // Reads what the url contains LineNumberReader in = new LineNumberReader(new InputStreamReader(url.openStream())); // Using Jsoup to scrape the data from the urls Document doc = Jsoup.connect(currentUrl).get(); Elements currAuthors = doc.select("#authors"); String authors = currAuthors.text(); Elements currClass = doc.select("#class"); String myClass = currClass.text(); Elements currGroup = doc.select("#group"); String group = currGroup.text(); // Simple sout to check if i got the correct data out // System.out.println("authors: " + authors); // System.out.println("class: " + myClass); // System.out.println("group: " + group); try { String line = null; while ((line = in.readLine()) != null) { Group myGroup = new Group(); // Adds the data to myGroup object myGroup.setAuthors(authors); myGroup.setMyClass(myClass); myGroup.setGroup(group); // System.out.println("\nGroup sout:"); // System.out.println("Group" + myGroup.getAuthors()); // System.out.println("Group" + myGroup.getGroup()); // System.out.println("Group" + myGroup.getMyClass()); result = myGroup; // result = "\nAuthors: " + authors + " Class: " + myClass + " Group: " + group + " \n--- // from group class: " // + myGroup.getAuthors() + " " + myGroup.getGroup() + " " + myGroup.getMyClass() + // "\n"; } } finally { in.close(); } // System.out.println(currentUrl + result.getGroup()); return result; }
/**
 * Applies the replacement pairs encoded in {@code query} to the text of the current elements.
 *
 * <p>The value of the replace tag is split on {@code "##"} into {@code from=to} pairs. A pair
 * with nothing after {@code '='} replaces {@code from} with the empty string.
 *
 * @param query selector part carrying the replace tag
 * @return the element text after all replacements, applied in order
 * @throws IllegalArgumentException if a pair fails the {@code isUniqueValue} check for
 *     {@code "="} or starts with {@code '='}
 */
private String parseReplace(String query) {
  String[] pairs = getValue(query, REPLACE_TAG).split("##");
  outputNodeInfo();
  String result = elements.text();

  for (String pair : pairs) {
    boolean malformed = !isUniqueValue(pair, "=") || pair.indexOf("=") == 0;
    if (malformed) {
      throw new IllegalArgumentException("Argument selector part: " + query + " is illegal");
    }

    String[] sides = pair.split("=");
    String target = replaceSpacePlaceHolder(replaceEqualPlaceHolder(sides[0]));
    String replacement =
        sides.length == 1 ? "" : replaceSpacePlaceHolder(replaceEqualPlaceHolder(sides[1]));
    result = result.replace(target, replacement);
  }
  return result;
}
@Override public Article run(HtmlObject htmlObject) { String html = htmlObject.getHtml(); Document doc = Jsoup.parse(html); String title = doc.select(".article h1").text(); Elements contentElement = doc.select(".article_con"); String content = ""; String contentHtml = ""; if (contentElement != null) { // contentElement.select(".author").remove(); content = contentElement.text(); contentHtml = contentElement.html(); } String Ele_data = doc.select(".article h2").text(); Matcher m1 = datePattern.matcher(Ele_data); String date = ""; if (m1.find()) { date = m1.group(1); } else { Date today = new Date(); SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd"); date = formatter.format(today); } Article model1 = new Article(); model1.setUrl(htmlObject.getUrl()); model1.setTitle(title); model1.setContent(content); model1.setPublishDate(date); model1.setArticleType(ArticleType.News); model1.setProvider("雨果网"); return model1; }
/** * achieve the num of people him/her fellowed * * @param doc * @return */ private String getFellowPeopleNum(Document doc) { Elements friendHtml = doc.select("div[id=\"friend\"]"); Elements fellowPeopleNumHtml = null; if (friendHtml != null) { fellowPeopleNumHtml = friendHtml.select("a"); // 关注人数 if (fellowPeopleNumHtml != null) { String fellowPeopleNum = UtilsMethod.findFirstStringByRegex("成员[0-9]+", fellowPeopleNumHtml.text()); if (fellowPeopleNum != null) { fellowPeopleNum = fellowPeopleNum.replaceAll("[\\D]+", ""); if (fellowPeopleNum != null) { return fellowPeopleNum; } else { return null; } } else { return null; } } else { return null; } } else { return null; } }
/**
 * Stores {@code content} and, when it parses to a document whose {@code <title>} has text,
 * updates the title from it as well.
 *
 * @param content HTML content
 */
public void setContent(String content) {
  this.content = content;

  Elements titleTags = Jsoup.parse(content).select("title");
  if (!titleTags.hasText()) {
    return;
  }
  this.title = titleTags.text();
}
private boolean updateDailyNews(Document doc, String dailyTitle, DailyNews dailyNews) throws JSONException { Elements viewMoreElements = doc.getElementsByClass("view-more"); if (viewMoreElements.size() > 1) { dailyNews.setMulti(true); Elements questionTitleElements = doc.getElementsByClass("question-title"); for (int j = 0; j < viewMoreElements.size(); j++) { if (questionTitleElements.get(j).text().length() == 0) { dailyNews.addQuestionTitle(dailyTitle); } else { dailyNews.addQuestionTitle(questionTitleElements.get(j).text()); } Elements viewQuestionElement = viewMoreElements.get(j).select("a"); if (viewQuestionElement.text().equals("查看知乎讨论")) { dailyNews.addQuestionUrl(viewQuestionElement.attr("href")); } else { return false; } } } else if (viewMoreElements.size() == 1) { dailyNews.setMulti(false); Elements viewQuestionElement = viewMoreElements.select("a"); if (viewQuestionElement.text().equals("查看知乎讨论")) { dailyNews.setQuestionUrl(viewQuestionElement.attr("href")); } else { return false; } // Question title is the same with daily title if (doc.getElementsByClass("question-title").text().length() == 0) { dailyNews.setQuestionTitle(dailyTitle); } else { dailyNews.setQuestionTitle(doc.getElementsByClass("question-title").text()); } } else { return false; } return true; }
/** * achieve the person name on douban * * @param doc * @return */ private String getName(Document doc) { Elements nameHtml = doc.select("title"); // 姓名 if (nameHtml != null) { return nameHtml.text(); } else { return null; } }
/**
 * Gets the date the person joined douban.
 *
 * <p>The date is the first {@code dddd-dd-dd} match in the text of
 * {@code div.user-info div.pl}.
 *
 * @param doc the parsed profile page
 * @return whatever {@code UtilsMethod.findFirstStringByRegex} returns for that text, or
 *     {@code null} if the selection itself is {@code null}
 */
private String getJoinDate(Document doc) {
  Elements infoBlocks = doc.select("div[class=\"user-info\"] div[class=\"pl\"]");
  if (infoBlocks == null) {
    return null;
  }
  String infoText = infoBlocks.text();
  return UtilsMethod.findFirstStringByRegex("[0-9]{4}\\-[0-9]{2}\\-[0-9]{2}", infoText);
}
@Override public NewsEntity parseNewsPerCategory(String newsURL) { // TODO Auto-generated method stub NewsEntity parsetData = null; try { Document doc = Jsoup.connect(newsURL).timeout(Constants.MAX_DELAY_TIME * 1000).get(); doc.outputSettings().charset(Charset.forName("UTF-8")); doc.normalise(); Elements titleElement = doc.select("title"); String titleName = titleElement.text(); if (titleName.contains("|")) { titleName = titleName.substring(0, titleName.indexOf("|")).trim(); } Elements newsElements = doc.select("div[class=content]").select("p"); newsElements.select("a, img, script, xml, input, label, textarea").remove(); if (newsElements != null) { try { parsetData = new NewsEntity(); parsetData.setNewsTitle(titleName); parsetData.setNewsBody(newsElements.text()); parsetData.setNewsURL(newsURL); // System.out.println("URL: " + newsURL + " HASH: " + // NewsAggregatorUtility.StringToSHA1Hash(newsURL)); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return parsetData; }
public static void main(String[] args) { // String my_url_to_fetch = // "http://www.cdiscount.com/electromenager/tous-nos-accessoires/joint-hublot-d-30-30-cm/f-11029-ind3662734065501.html#mpos=2|mp"; // String my_url_to_fetch = // "http://www.cdiscount.com/le-sport/vetements-de-sport/kappa-survetement-armor-homme/f-121020526-3025ej0005.html#mpos=1|cd"; // String my_url_to_fetch = // "http://www.cdiscount.com/animalerie/chiens/lot-de-3-sofas-pour-chien/f-1621004-ifd19945rouge.html"; // String my_url_to_fetch = "http://www.cdiscount.com/telephonie/r-housse+guidon.html#_his_"; String my_url_to_fetch = "http://www.cdiscount.com/maison/tapis/rio-tapis-shaggy-anthracite-30-mm-160x230-cm/f-1172512-r252an160230.html"; // fetching data using jQuery org.jsoup.nodes.Document doc; try { // we wait between 30 and 70 seconds doc = Jsoup.connect(my_url_to_fetch) .userAgent( "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13 (.NET CLR 3.5.30729)") .referrer("accounterlive.com") .ignoreHttpErrors(true) .timeout(0) .get(); Elements titleel = doc.select("title"); System.out.println(titleel.text()); doc = Jsoup.connect(my_url_to_fetch) .userAgent( "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13 (.NET CLR 3.5.30729)") .referrer("accounterlive.com") .ignoreHttpErrors(true) .timeout(0) .get(); Elements titleel2 = doc.select("title"); System.out.println(titleel2.text()); } catch (IOException e) { e.printStackTrace(); } }
/**
 * Downloads {@code link} and fills the title, image source, description and date fields from
 * the page. Any failure is printed and otherwise ignored.
 *
 * @return always {@code null}
 */
@Override
protected Void doInBackground(Void... params) {
  try {
    Document page = Jsoup.connect(link).ignoreContentType(true).get();

    title = page.select("span.title2").text();
    img_src = page.select("img.news-record-thumbnail").attr("src");

    // Each paragraph is appended to the description, followed by a blank line.
    Elements paragraphs = page.select("div.news-block-justify").select("p");
    for (Element paragraph : paragraphs) {
      description_text += paragraph.text() + "\n\n";
    }

    DateAdded = page.select("span.title").text();
  } catch (Exception ex) {
    ex.printStackTrace();
  }
  return null;
}
/**
 * Evaluates the result query and returns its result as a list of strings.
 *
 * <ul>
 *   <li>Several elements: their combined text split on {@code DEFAULT_SEPARATOR}.
 *   <li>One element: its text split on {@code separator}, or the text as a single entry when
 *       {@code separator} is null or empty.
 *   <li>A string: split on {@code separator}, or returned as a single entry when
 *       {@code separator} is null or empty. This matches the single-element case and avoids a
 *       {@code NullPointerException}.
 *   <li>A list: returned as is.
 * </ul>
 *
 * @param separator regular expression used to split a single value; may be null or empty
 * @return the list of values, or {@code null} when the result is of any other type
 */
@SuppressWarnings("unchecked")
public List<String> selectListResult(String separator) {
  Object result = getResult(resultQuery);

  if (result instanceof Elements) {
    Elements eles = (Elements) result;
    if (eles.size() > 1) {
      return Arrays.asList(eles.text().split(DEFAULT_SEPARATOR));
    }
    if (eles.size() == 1 && !StringUtils.isEmpty(separator)) {
      return Arrays.asList(eles.first().text().split(separator));
    }
    return ListKit.of(eles.text());
  }

  if (result instanceof String) {
    String string = (String) result;
    if (StringUtils.isEmpty(separator)) {
      // Nothing to split on: return the whole value.
      return ListKit.of(string);
    }
    return Arrays.asList(string.split(separator));
  }

  if (result instanceof List) {
    return (List<String>) result;
  }
  return null;
}
/**
 * Extracts the product brand from the link inside the {@code h1.component} heading of the
 * product detail column.
 *
 * @param doc parsed product page
 * @return a one-element list holding the lower-cased, trimmed brand text (empty string when
 *     the link is missing)
 */
@Override
protected List<String> getProductBrandFromHTML(Document doc) throws Exception {
  Elements brand =
      doc.select("div.main-product")
          .select("div.detail-column")
          .select("h1.component")
          .select("a");

  // Lower-case with a fixed locale so the result does not depend on the JVM default locale
  // (for example, "I" lower-cases to a dotless "ı" under a Turkish default).
  String brandOfproduct = brand.text().toLowerCase(java.util.Locale.ROOT).trim();

  List<String> list = new ArrayList<String>();
  list.add(brandOfproduct);
  return list;
}
public String leituraJxr() throws IOException { // método para pegar os nomes dos métodos declarados Elements elements = document.getElementsByTag("pre"); elements.select("a.jxr_linenumber").remove(); // elements.select("strong.jxr_keyword").remove(); // elements.select("span.jxr_string").remove(); // elements.select("em.jxr_comment").remove(); for (Element children : elements) { children.getElementsByClass("jxr_comment").remove(); children.getElementsByClass("jxr_javadoccomment").remove(); } return elements.text(); // retorna o código sem lixo }
// Busca o Cep pelo logradouro. public List<String> getByAdress(String address) throws IOException { listAddress = new ArrayList<String>(); // mapeamento dos parametros que será passado na requisição Map<String, String> query = new HashMap<String, String>(); query.put("relaxation", address); query.put("TipoCep", "ALL"); query.put("semelhante", "N"); query.put("cfm", "1"); query.put("Metodo", "listaLogradouro"); query.put("TipoConsulta", "relaxation"); query.put("StartRow", "1"); query.put("EndRow", "10"); // Faz uma requisição no site do correios (www.buscacep.com.br) com Json, passando os parametros // mapeados, // requisição deverá ser do tipo post. // Armazena o retorno em uma variavel doc. Document doc = Jsoup.connect(Utils.adressCorreios) .timeout(20000) .data(query) .header("Origin", "http://www.buscacep.correios.com.br") .header("Referer", "http://www.buscacep.correios.com.br") .post(); // Acessa o retorno do doc e percorre o resultado buscando as informações de Cep de acordo com o // endereço passado. // Armazena os resultados na lista criada e retorna a mesma para que outras classes possam // acessar Elements elements = doc.select("table").eq(2); Elements rows = elements.select("tr"); Iterator<Element> rowIterator = rows.iterator(); while (rowIterator.hasNext()) { Address enderecos = new Address(); Element element = rowIterator.next(); Elements codigopostal = element.children().select("td").eq(4); enderecos.setCEP(codigopostal.text()); listAddress.add(enderecos.getCEP()); } return listAddress; }
public static void main(String[] args) throws Exception { // 第一步:访问页面 String url = "http://www.huxiu.com/article/102062/1.html"; Document document = Jsoup.connect(url).get(); // 第二步:解析页面 Elements titleElements = document.getElementsByTag("title"); String title = titleElements.get(0).text(); Elements elements = document.select("div #article_content"); String content = elements.text(); // 第三步:打印 System.out.println("title:" + title); System.out.println("content:" + content); }
private void getDatafromJsoup(String url) { // TODO Auto-generated method stub try { Document doc = Jsoup.connect(url).get(); // Elements content = doc.getElementsByClass("cell item"); Elements header = doc.getElementsByClass("topic_content"); Log.e("topic_content", header.text()); title = header.text(); Elements content = doc.getElementsByTag("tbody"); for (Element link : content) { DetailEntity entity = new DetailEntity(); Elements avatar = link.getElementsByTag("img"); { String avaterLink = avatar.attr("src"); if (avaterLink.startsWith("//cdn.")) { entity.setAvater("http:" + avaterLink); } } Elements reply_content = link.getElementsByClass("reply_content"); Log.e("reply_content", reply_content.text()); entity.setReply_count(reply_content.text()); Elements title = link.getElementsByTag("a"); if (title.attr("href").startsWith("/member/")) { Log.e("title", title.text()); entity.setTitle(title.text()); } Log.e( "other", link.getElementsByClass("fade small").text() + link.getElementsByClass("small fade").text()); if (!TextUtils.isEmpty(reply_content.text())) entities.add(entity); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } }
/**
 * Gets the number of people who follow this user.
 *
 * <p>The digits are taken from the first {@code 被<digits>人关注} match in the text of
 * {@code p.rev-link}.
 *
 * @param doc the parsed profile page
 * @return the digits as a string, or {@code null} when no match is found
 */
private String getFellowedPeopleNum(Document doc) {
  // select() returns an empty list, never null, when nothing matches.
  String linkText = doc.select("p[class=\"rev-link\"]").text();
  String match = UtilsMethod.findFirstStringByRegex("被[0-9]+人关注", linkText);
  if (match == null) {
    return null;
  }
  // Keep only the digits of the match.
  return match.replaceAll("[\\D]+", "");
}
/**
 * Evaluates the result query and converts its textual result to {@code clazz}.
 *
 * <p>Elements contribute their combined text, a string is used as is, and anything else counts
 * as the empty string. {@code String} is returned directly. For {@code Integer} the text is
 * first passed through {@code NumberKit.removeNotNumber}. Everything but {@code String} is
 * then converted with Gson.
 *
 * @param clazz target type; must not be {@code null}
 * @return the converted value
 * @throws IllegalArgumentException if {@code clazz} is {@code null}
 */
@SuppressWarnings("unchecked")
public <T> T selectResult(Class<T> clazz) {
  if (clazz == null) {
    throw new IllegalArgumentException("Argument clazz cannot be null");
  }

  Object result = getResult(resultQuery);
  String text;
  if (result instanceof Elements) {
    text = ((Elements) result).text();
  } else if (result instanceof String) {
    text = (String) result;
  } else {
    text = "";
  }

  if (String.class.equals(clazz)) {
    return (T) text;
  }
  if (Integer.class.equals(clazz)) {
    text = NumberKit.removeNotNumber(text);
  }
  return new Gson().fromJson(text, clazz);
}
/**
 * Gets the "want to listen" music list.
 *
 * <p>Uses the first section whose {@code substatus} text is {@code 想听} and returns the
 * {@code title} attribute of each of its {@code li.aob} links.
 *
 * @param obssinHtml candidate sections; may be {@code null}
 * @return the titles of that section, or {@code null} when the input is {@code null} or no
 *     section matches
 */
private List<String> getWantMusic(Elements obssinHtml) {
  if (obssinHtml == null) {
    return null;
  }
  for (Element section : obssinHtml) {
    String status = section.select("div[class=\"substatus\"]").text();
    if (!"想听".equals(status)) {
      continue;
    }
    Elements links = section.select("li[class=\"aob\"] a");
    if (links == null) {
      return null;
    }
    List<String> titles = new ArrayList<String>();
    for (Element link : links) {
      titles.add(link.attr("title"));
    }
    return titles;
  }
  return null;
}
/**
 * Gets the list of watched movies.
 *
 * <p>Uses the first section whose {@code substatus} text is {@code 看过} and returns the
 * {@code title} attribute of each of its {@code li.aob} links.
 *
 * @param obssinHtml candidate sections; may be {@code null}
 * @return the titles of that section, or {@code null} when the input is {@code null} or no
 *     section matches
 */
private List<String> getWatchedMovie(Elements obssinHtml) {
  if (obssinHtml == null) {
    return null;
  }
  for (Element section : obssinHtml) {
    String status = section.select("div[class=\"substatus\"]").text();
    if (!"看过".equals(status)) {
      continue;
    }
    Elements links = section.select("li[class=\"aob\"] a");
    if (links == null) {
      return null;
    }
    List<String> titles = new ArrayList<String>();
    for (Element link : links) {
      titles.add(link.attr("title"));
    }
    return titles;
  }
  return null;
}
/**
 * Gets the "want to read" book list.
 *
 * <p>Uses the first section whose {@code substatus} text is {@code 想读} and returns the
 * {@code alt} attribute of each image inside its {@code li.aob} links.
 *
 * @param obssinHtml candidate sections; may be {@code null}
 * @return the book names of that section, or {@code null} when the input is {@code null} or
 *     no section matches
 */
private List<String> getWantBook(Elements obssinHtml) {
  if (obssinHtml == null) {
    return null;
  }
  for (Element section : obssinHtml) {
    String status = section.select("div[class=\"substatus\"]").text();
    if (!"想读".equals(status)) {
      continue;
    }
    Elements covers = section.select("li[class=\"aob\"] a img");
    if (covers == null) {
      return null;
    }
    List<String> names = new ArrayList<String>();
    for (Element cover : covers) {
      names.add(cover.attr("alt"));
    }
    return names;
  }
  return null;
}
/** * 检查返回内容是否错误 * * @param html * @throws RespUrlException * @throws ResourceNotFountException */ private void checkRespHaveAlertError(String html) throws IOException { Document doc = Jsoup.parse(html); Elements htmlEle = doc.select("html"); String key = null; if (!htmlEle.isEmpty()) { key = doc.select("html").first().attr("xmlns:wb"); } Elements eles = doc.select("div.alert-error"); String errMsg = eles.text(); if (StringUtils.indexOf(errMsg, "影片暂时不可以访问") != -1) { // 链接资源错误 throw new ResourceNotFountException(errMsg); } else if (StringUtils.indexOf(errMsg, "遇到一个错误了") != -1) { throw new ResourceNotFountException(errMsg); } else { if (!eles.isEmpty()) { throw new RespUrlException("请求页面结果错误"); } else if (!"http://open.weibo.com/wb".equals(key)) { throw new RespUrlException("代理返回结果错误"); } } }
/**
 * Parses the pages referenced by an existing entity's same-as links and copies working hours,
 * price range, services and homepage into the entity.
 *
 * <p>Processing stops at the first link whose page cannot be loaded or fails
 * {@code validateSite}.
 *
 * @param pEntity entity to enrich; updated in place
 */
public void parseIndividualEnt(PersistentEntity pEntity) {
  // Iterate over a snapshot of the keys: the loop body may call putToSameAs(), and adding to
  // the map while iterating it directly would invalidate the iterator.
  for (Utf8 sameAsKey : new java.util.ArrayList<Utf8>(pEntity.getSameAs().keySet())) {
    String entityUrl = ParserUtils.getUri(sameAsKey.toString()).toASCIIString();
    LOGGER.info("Parsing entity from: " + entityUrl);
    Document doc = ParserUtils.connectGetUrl(entityUrl);

    // Either condition disqualifies the page; validateSite is only called on a loaded page.
    if (doc == null || !validateSite(doc)) {
      break;
    }

    doc.setBaseUri(VejaSaoPauloParser.DEFAULT_VSP_URL);
    StringBuilder strBuilder = new StringBuilder();

    // Working hours.
    Elements workElems =
        doc.select("div[class*=information-unwanted]").select("div[class*=working-hours]");
    if (!workElems.isEmpty()) {
      for (Element info : workElems.select("div[class*=hours]").select("p")) {
        strBuilder.append(info.text().replace("-", "_")).append(ParserProperties.INFO_SEP);
      }
      pEntity.setSchedule(new Utf8(strBuilder.toString()));
    }

    // Price range.
    workElems =
        doc.select("div[class*=information-unwanted]")
            .select("div[class*=price]")
            .select("p[class*=price-range]");
    strBuilder.setLength(0);
    if (!workElems.isEmpty()) {
      // first() is null when the section has no heading.
      Element priceHeading = doc.select("div[class*=price]").select("h3").first();
      strBuilder.append(
          (priceHeading != null ? priceHeading.text() : "") + ParserProperties.DESC_SEP);
      strBuilder.append(workElems.text());
      pEntity.addToExtraInfo(new Utf8(strBuilder.toString()));
    }

    // Payment information.
    // NOTE(review): the text assembled here is never stored on the entity (unlike the price
    // range above) — confirm whether an addToExtraInfo call is missing.
    workElems =
        doc.select("div[class*=information-unwanted]")
            .select("div[class*=payment]")
            .select("p");
    strBuilder.setLength(0);
    if (!workElems.isEmpty()) {
      Element paymentHeading = doc.select("div[class*=payment]").select("h3").first();
      strBuilder.append(
          (paymentHeading != null ? paymentHeading.text() : "") + ParserProperties.DESC_SEP);
      for (Element infoElem : workElems) {
        if (!infoElem.text().trim().equals("")) {
          strBuilder.append(infoElem.text().trim() + ParserProperties.INFO_SEP);
        }
      }
    }

    // Services: "observation" paragraphs go to the extra info, the rest to the services list.
    workElems =
        doc.select("div[class*=information-unwanted]")
            .select("div[class*=services]")
            .select("div[class*=information-services]")
            .select("p");
    strBuilder.setLength(0);
    for (Element infoElem : workElems) {
      if (infoElem.hasClass("observation")) {
        pEntity.addToExtraInfo(new Utf8("Observation :" + infoElem.text()));
      } else if (!infoElem.text().equals("")) {
        pEntity.addToServices(new Utf8(infoElem.text()));
      }
    }

    // Homepage link.
    workElems = doc.select("div[class*=information-unwanted]").select("div[class*=website]");
    if (!workElems.isEmpty()) {
      EylloLink homeLink =
          ParserUtils.detectUrl(
              workElems
                  .select("div[class*=information-website]")
                  .select("p")
                  .select("a")
                  .first());
      if (homeLink != null) {
        pEntity.setHomepage(new Utf8(homeLink.getLinkHref()));
        pEntity.putToSameAs(new Utf8(homeLink.getLinkHref()), new Utf8(homeLink.getLinkText()));
      }
    }

    pEntity.setDescription(new Utf8(""));
  }
}
/**
 * Submits a post form to {@code url} and records the outcome in {@code mResult} and
 * {@code mStatus}.
 *
 * <p>If no form hash is available from {@code mInfo}, the method fails immediately without
 * sending anything. Otherwise it builds the form parameters (form hash, post time, message,
 * attachment additions and deletions taken from {@code mInfo}) and sends them with a
 * synchronous POST through {@code VolleyHelper}. In new-thread mode the subject is always
 * sent; in edit mode it is sent only when non-empty, together with a non-empty
 * {@code typeid}. {@code mTitle} is set whenever a subject is sent.
 *
 * <p>The post counts as successful when the response contains a positive numeric thread id
 * after {@code tid = parseInt('} and does not contain {@code alert_info}; {@code mTid} is then
 * set. Otherwise the status is a failure and the text of {@code div.alert_info}, if present,
 * is appended to the result message. An empty response is reported as a failure together with
 * the error listener's error text.
 *
 * @param url address the form is posted to
 * @param replyText message body
 * @param subject post subject; see above for when it is sent
 * @param typeid type id; only sent in edit mode together with a non-empty subject
 */
private void doPost(String url, String replyText, String subject, String typeid) { String formhash = mInfo != null ? mInfo.getFormhash() : null; if (TextUtils.isEmpty(formhash)) { mResult = "发表失败,无法获取必要信息 !"; mStatus = Constants.STATUS_FAIL; return; } Map<String, String> post_param = new HashMap<>(); post_param.put("formhash", formhash); post_param.put("posttime", String.valueOf(System.currentTimeMillis())); post_param.put("wysiwyg", "0"); post_param.put("checkbox", "0"); post_param.put("message", replyText); for (String attach : mInfo.getAttaches()) { post_param.put("attachnew[" + attach + "][description]", attach); } for (String attach : mInfo.getAttachdel()) { post_param.put("attachdel[" + attach + "]", attach); } for (String attach : mInfo.getUnusedImages()) { post_param.put("attachdel[" + attach + "]", attach); } if (mMode == MODE_NEW_THREAD) { post_param.put("subject", subject); post_param.put("attention_add", "1"); mTitle = subject; } else if (mMode == MODE_EDIT_POST) { if (!TextUtils.isEmpty(subject)) { post_param.put("subject", subject); mTitle = subject; if (!TextUtils.isEmpty(typeid)) { post_param.put("typeid", typeid); } } } SimpleErrorListener errorListener = VolleyHelper.getInstance().getErrorListener(); String rsp_str = VolleyHelper.getInstance().synchronousPost(url, post_param, errorListener); // when success, volley will follow 302 redirect get the page content if (!TextUtils.isEmpty(rsp_str)) { String tid = ""; if (rsp_str.contains("tid = parseInt('")) { tid = HttpUtils.getMiddleString(rsp_str, "tid = parseInt('", "'"); } if (!TextUtils.isEmpty(tid) && TextUtils.isDigitsOnly(tid) && Integer.parseInt(tid) > 0 && !rsp_str.contains("alert_info")) { mTid = tid; mResult = "发表成功!"; mStatus = Constants.STATUS_SUCCESS; } else { Logger.e(rsp_str); mResult = "发表失败! 
"; mStatus = Constants.STATUS_FAIL; Document doc = Jsoup.parse(rsp_str); Elements error = doc.select("div.alert_info"); if (error != null && error.size() > 0) { mResult += error.text(); } else { if (HiSettingsHelper.getInstance().isErrorReportMode()) ACRAUtils.acraReport("Error when posting but with response", rsp_str); } } } else { Logger.e(errorListener.getError()); mResult = "发表失败,无返回结果! " + errorListener.getErrorText(); mStatus = Constants.STATUS_FAIL; if (HiSettingsHelper.getInstance().isErrorReportMode()) ACRAUtils.acraReport(errorListener.getError(), "no response"); } }
/**
 * Reads the raw string values of one stock page out of {@code doc} into this object's
 * {@code ...Str} fields.
 *
 * <p>Steps, in order:
 *
 * <ol>
 *   <li>From the first element matching {@code CSS_QUERY_TO_FIND_STOCKS_INFO}: the {@code dt}
 *       text becomes the stock code and the {@code .category} text the sector.
 *   <li>From the first element matching {@code CSS_QUERY_TO_FIND_STOCKS_TABLE}: the
 *       {@code .symbol} text becomes the stock name. Of its non-empty {@code td} cells, one
 *       with class {@code change} gives the comparison with the previous day; any other cell
 *       that parses as a number (after {@code Util.removeCommaAndNbsp}) gives the realtime
 *       price.
 *   <li>The market is taken from the first span whose text contains one of the characters
 *       マ, 札, 東 or 福, cut at the first {@code '('}. If no span matches, the first
 *       non-empty span that does not start with {@code '('} and is not {@code PTS} or
 *       {@code OTC} is used.
 *   <li>Every {@code dl} found by {@code CSS_QUERY_IN_DETAIL_PAGE_TO_FIND_ALL_DL} is matched by
 *       the prefix of its {@code dt} caption against the {@code CAPTION_*} constants, first
 *       match wins, and its {@code dd} text is stored in the corresponding field. The
 *       "comparison with previous week" caption is assigned to the margin debt or the margin
 *       selling field depending on which of those two balances was seen last. Unrecognised
 *       captions, apart from a few known ones, are printed to standard output.
 * </ol>
 *
 * @param doc parsed stock page
 * @throws FailedToFindElementException if the stock info element or the stock table element is
 *     missing
 */
private void extractDataAsString(Document doc) throws FailedToFindElementException { Elements infoElements = doc.select(CSS_QUERY_TO_FIND_STOCKS_INFO); if (infoElements == null || infoElements.size() < 1) { throw new FailedToFindElementException("Cannot find stock info element."); } Element info = infoElements.get(0); if (info != null) { Elements dts = info.select("dt"); if (dts != null) { stockCodeStr = dts.text().trim(); } Elements category = info.select(".category"); if (category != null) { sectorStr = category.text().trim(); } } Elements stocksTables = doc.select(CSS_QUERY_TO_FIND_STOCKS_TABLE); if (stocksTables == null || stocksTables.size() < 1) { throw new FailedToFindElementException("Cannot find stock table element."); } Element stocksTable = stocksTables.get(0); Elements symbol = stocksTable.select(".symbol"); if (symbol != null) { stockNameStr = symbol.text().trim(); } Elements tds = stocksTable.select("td"); for (Element td : tds) { String text = Util.normalizeRoundParentheses(td.text().trim()); if (text.length() == 0) { } else if (td.classNames().contains("change")) { priceComparisonWithPreviousDayStr = text; } else { try { Double.parseDouble(Util.removeCommaAndNbsp(text)); } catch (NumberFormatException e) { continue; } realtimePriceStr = text; } } Elements spans = doc.select(CSS_QUERY_IN_DETAIL_PAGE_TO_FIND_SPAN_UNDER_ID_DEAL); // マザーズ,札証,札幌ア,東証,東証1部, 東証2部,東証JQG,東証JQS,東証外国,福岡Q, 福証 for (Element span : spans) { String s = span.text().trim(); if (s.length() > 0 && (s.indexOf("マ") >= 0 || s.indexOf("札") >= 0 || s.indexOf("東") >= 0 || s.indexOf("福") >= 0)) { int index = s.indexOf('('); if (index >= 0) { s = s.substring(0, index); } marketStr = s; break; } } if (marketStr == null) { for (Element span : spans) { String s = span.text().trim(); if (s.length() > 0 && !s.startsWith("(") && !s.equals("PTS") && !s.equals("OTC")) { int index = s.indexOf('('); if (index >= 0) { s = s.substring(0, index); } marketStr = s; break; } } } boolean isDebt = false; boolean 
isSelling = false; Elements dls = doc.select(CSS_QUERY_IN_DETAIL_PAGE_TO_FIND_ALL_DL); for (Element dl : dls) { Elements dt = dl.getElementsByTag("dt"); Elements dd = dl.getElementsByTag("dd"); String caption = dt.text().trim(); String value = Util.normalizeRoundParentheses(dd.text().trim()); if (caption.startsWith(CAPTION_PREVIOUS_CLOSING_PRICE)) { previousClosingPriceStr = value; } else if (caption.startsWith(CAPTION_OPENING_PRICE)) { openingPriceStr = value; } else if (caption.startsWith(CAPTION_HIGH_PRICE)) { highPriceStr = value; } else if (caption.startsWith(CAPTION_LOW_PRICE)) { lowPriceStr = value; } else if (caption.startsWith(CAPTION_TRADING_VOLUME_OF_STOCKS)) { tradingVolumeOfStocksStr = value; } else if (caption.startsWith(CAPTION_TRADING_VALUE_OF_MONEY)) { tradingValueOfMoneyStr = value; } else if (caption.startsWith(CAPTION_PRICE_LIMIT)) { priceLimitStr = value; } else if (caption.startsWith(CAPTION_MARKET_CAPITALIZATION)) { marketCapitalizationStr = value; } else if (caption.startsWith(CAPTION_OUTSTANDING_STOCK_VOLUME)) { outstandingStockVolumeStr = value; } else if (caption.startsWith(CAPTION_ANNUAL_INTEREST_RATE)) { annualInterestRateStr = value; } else if (caption.startsWith(CAPTION_DIVIDENDS_PER_SHARE)) { dividendsPerShareStr = value; } else if (caption.startsWith(CAPTION_PER)) { perStr = value; } else if (caption.startsWith(CAPTION_PBR)) { pbrStr = value; } else if (caption.startsWith(CAPTION_EPS)) { epsStr = value; } else if (caption.startsWith(CAPTION_BPS)) { bpsStr = value; } else if (caption.startsWith(CAPTION_MINIMUM_PURCHASE_AMOUNT)) { minimumPurchaseAmountStr = value; } else if (caption.startsWith(CAPTION_SHARE_UNIT_NUMBER)) { shareUnitNumberStr = value; } else if (caption.startsWith(CAPTION_YEARLY_HIGH)) { yearlyHighStr = value; } else if (caption.startsWith(CAPTION_YEARLY_LOW)) { yearlyLowStr = value; } else if (caption.startsWith(CAPTION_NET_ASSETS)) { netAssetsStr = value; } else if (caption.startsWith(CAPTION_UNIT_OF_TRADING)) { 
unitOfTradingStr = value; } else if (caption.startsWith(CAPTION_MANAGEMENT_COMPANY)) { managementCompanyStr = value; } else if (caption.startsWith(CAPTION_TYPE_OF_ASSETS_TO_BE_INVESTED)) { typeOfAssetsToBeInvestedStr = value; } else if (caption.startsWith(CAPTION_REGION_TO_BE_INVESTED)) { regionToBeInvestedStr = value; } else if (caption.startsWith(CAPTION_UNDERLYING_INDEX)) { underlyingIndexStr = value; } else if (caption.startsWith(CAPTION_SETTLEMENT_FREQUENCY)) { settlementFrequencyStr = value; } else if (caption.startsWith(CAPTION_SETTLEMENT_MONTH)) { settlementMonthStr = value; } else if (caption.startsWith(CAPTION_LISTED_DATE)) { listedDateStr = value; } else if (caption.startsWith(CAPTION_TRUST_FEE)) { trustFeeStr = value; } else if (caption.startsWith(CAPTION_MARGIN_DEBT_BALANCE)) { marginDebtBalanceStr = value; isDebt = true; isSelling = false; } else if (caption.startsWith(CAPTION_MARGIN_RATIO_COMPARISON_WITH_PREVIOUS_WEEK)) { if (isDebt) { marginDebtBalanceRatioComparisonWithPreviousWeekStr = value; } else if (isSelling) { marginSellingBalanceRatioComparisonWithPreviousWeekStr = value; } } else if (caption.startsWith(CAPTION_MARGIN_SELLING_BALANCE)) { marginSellingBalanceStr = value; isDebt = false; isSelling = true; } else if (caption.startsWith(CAPTION_RATIO_OF_MARGIN_BALANCE)) { ratioOfMarginBalanceStr = value; } else { if (!caption.equals("") && !caption.startsWith("値上がり率") && !caption.startsWith("値下がり率") && !caption.startsWith("[買い]") && !caption.startsWith("[売り]") && value.indexOf("リアルタイム株価") < 0) { // // TODO: unknown data format // System.out.println("unknown caption=" + caption); System.out.println("unknown value=" + value); } } } }