/**
 * Downloads the deck page for {@code hsDeck} and fills in its class cards, neutral cards and
 * description.
 *
 * @param hsDeck deck whose URL (appended to {@code HPDeckSource.BASE_URL}) is fetched; updated
 *     in place
 * @param n passed through unchanged to {@code HtmlHelper.parseDescription}
 * @return the same {@code hsDeck} instance; untouched when the page cannot be downloaded
 */
@Override
public HSDeck getDeckDetail(final HSDeck hsDeck, final float n) {
  try {
    final Document page = Jsoup.connect(HPDeckSource.BASE_URL + hsDeck.getUrl()).get();

    final HashMap<String, String> classHsItemMap = new HashMap<String, String>();
    final ArrayList<String> classNames = new ArrayList<String>();
    collectCardCells(
        page.select("section.class-listing table.listing td.col-name"),
        classHsItemMap,
        classNames);
    hsDeck.setClassHsItemMap(classHsItemMap);
    hsDeck.setClassHsItemList(DataBaseManager.getInstance().getAllCardsByNames(classNames));

    final HashMap<String, String> neutralHsItemMap = new HashMap<String, String>();
    final ArrayList<String> neutralNames = new ArrayList<String>();
    collectCardCells(
        page.select("section.neutral-listing table.listing td.col-name"),
        neutralHsItemMap,
        neutralNames);
    hsDeck.setNeutralHsItemMap(neutralHsItemMap);
    hsDeck.setNeutralHsItemList(DataBaseManager.getInstance().getAllCardsByNames(neutralNames));

    hsDeck.setDescription(
        HtmlHelper.parseDescription(page.select("div.deck-description").html(), n, false));
    return hsDeck;
  } catch (IOException ex) {
    ex.printStackTrace();
    return hsDeck;
  }
}

/**
 * Reads one card-listing column: for every cell, the text of its first link is the key and the
 * last character of the trimmed cell text is the value. Cells without a link or without text
 * are skipped instead of raising an index exception.
 */
private void collectCardCells(
    final Elements cells, final HashMap<String, String> lastCharByName, final ArrayList<String> names) {
  for (int i = 0; i < cells.size(); ++i) {
    final Elements anchors = cells.get(i).select("a");
    final String cellText = cells.get(i).text().trim();
    if (anchors.isEmpty() || cellText.isEmpty()) {
      continue;
    }
    final String name = anchors.get(0).text();
    lastCharByName.put(name, cellText.substring(cellText.length() - 1));
    names.add(name);
  }
}
/**
 * Turns one result row into a {@code Name} and appends it to {@code results}.
 *
 * <p>The thumbnail comes from the image inside the {@code primary_photo} element; the URL and
 * name come from the first link inside the {@code result_text} element. When a {@code <small>}
 * element is present its text (outer parentheses stripped) is split at the first comma into a
 * job and a reference title; without a comma it becomes a reference if it ends in parenthesised
 * digits, otherwise it is taken as the job.
 */
@Override
protected void parseRow(
    final String query, final int options, final Element tr, final List<Name> results) {
  final Element photoCell = tr.getElementsByAttributeValue("class", "primary_photo").first();
  final String thumbnailUrl = photoCell.getElementsByTag("img").first().attr("src");

  final Element textCell = tr.getElementsByAttributeValue("class", "result_text").first();
  final Element nameLink = textCell.getElementsByTag("a").first();
  final String url = Imdb.BASE_URL + nameLink.attr("href");
  final String name = nameLink.ownText();

  String job = "";
  Reference ref = null;

  final Element small = textCell.getElementsByTag("small").first();
  if (small != null) {
    final String refUrl = Imdb.BASE_URL + small.getElementsByTag("a").first().attr("href");

    String desc = small.text();
    final boolean parenthesised = desc.startsWith("(") && desc.endsWith(")");
    if (parenthesised) {
      desc = desc.substring(1, desc.length() - 1);
    }

    final int comma = desc.indexOf(',');
    if (comma != -1) {
      job = desc.substring(0, comma).trim();
      ref = new Reference(refUrl, desc.substring(comma + 1).trim());
    } else if (desc.matches(".+\\(\\d+\\)")) {
      // No comma: the whole description is the reference title.
      ref = new Reference(refUrl, desc.trim());
    } else {
      job = desc;
    }
  }

  results.add(new Name(url, thumbnailUrl, name, job, ref));
}
/** * achieve the num of people him/her fellowed * * @param doc * @return */ private String getFellowPeopleNum(Document doc) { Elements friendHtml = doc.select("div[id=\"friend\"]"); Elements fellowPeopleNumHtml = null; if (friendHtml != null) { fellowPeopleNumHtml = friendHtml.select("a"); // 关注人数 if (fellowPeopleNumHtml != null) { String fellowPeopleNum = UtilsMethod.findFirstStringByRegex("成员[0-9]+", fellowPeopleNumHtml.text()); if (fellowPeopleNum != null) { fellowPeopleNum = fellowPeopleNum.replaceAll("[\\D]+", ""); if (fellowPeopleNum != null) { return fellowPeopleNum; } else { return null; } } else { return null; } } else { return null; } } else { return null; } }
/**
 * Parses {@code inputContent} as HTML and returns the trimmed inner HTML of the elements
 * matching {@code cssSelector}.
 *
 * @param inputContent HTML text to parse
 * @return trimmed HTML of the matches, or {@code null} when nothing matches
 * @throws Exception propagated from {@code validate()}
 */
@Override
public String fire(String inputContent) throws Exception {
  validate();
  Elements matches = Jsoup.parse(inputContent).select(cssSelector);
  if (matches == null || matches.size() <= 0) {
    return null;
  }
  return matches.html().trim();
}
/**
 * Fetches {@code url} with the configured HTTP method and, for every configured attribute,
 * stores the mapped text of the first element matching the attribute's query.
 *
 * @param metaData receives one entry per attribute whose query matched something
 * @throws MetaDataException if the method is neither "GET" nor "POST" (including when it is
 *     unset), the page cannot be fetched, or a value mapper rejects the text
 */
@Override
public void populateMetaData(MetaData metaData) throws MetaDataException {
  try {
    Document doc;
    // Constant-first equals: an unset method reports "unsupported" instead of raising an NPE.
    if ("GET".equals(method)) {
      doc = Jsoup.connect(url).get();
    } else if ("POST".equals(method)) {
      doc = Jsoup.connect(url).data(requestData).post();
    } else {
      throw new MetaDataException("Unsupported HTML access method: " + method);
    }

    for (MetaDataAttribute attribute : attributes) {
      Elements elements = doc.select(attribute.getQuery());
      if (elements.isEmpty()) {
        continue;
      }
      String sValue = elements.get(0).text();
      Object oValue = attribute.getValueMapper().parse(sValue);
      metaData.put(attribute.getName(), oValue);
    }
  } catch (IOException e) {
    throw new MetaDataException(e);
  } catch (ValueMapperException e) {
    throw new MetaDataException(e);
  }
}
public Chapter createChapter(int id, String page) { Chapter chapter = new Chapter(id); chapter.setUrl(Constants.BASE_URL + getVersion() + page); String cache = getCachePath() + page; try { String html = client.requestWithCache(chapter.getUrl(), cache, client.METHOD_GET, null); Document chapterDoc = Jsoup.parse(html); // 取出内容 Elements tables = chapterDoc.select("table"); int tableIndexOfMainBody = 1; if (tables.size() == 1) { tableIndexOfMainBody = 0; } Element table = chapterDoc.select("table").get(tableIndexOfMainBody); Elements sectionElements = table.select("td[class=v]"); logger.debug(sectionElements.size()); for (Element tdIndex : sectionElements) { Element tdContent = tdIndex.nextElementSibling(); String section = tdContent.text(); logger.debug(section); chapter.addSection(section); } } catch (IOException e) { logger.error(e.getMessage()); } catch (IndexOutOfBoundsException e) { logger.error(e.getMessage()); } return chapter; }
/**
 * Initializes this entry from a table row: the show name is decoded from the
 * {@code data-sc-params} attribute in the first cell, seeds and peers are read from the fifth
 * and sixth cells, and the download link is derived from the "Download torrent file" anchor.
 */
@Override
protected void initialize(Element source) {
  Elements cells = source.getElementsByTag("td");
  Element firstCell = cells.get(0);

  String params = firstCell.select("[data-sc-params]").get(0).attr("data-sc-params");
  String name = params.replaceAll("\\{ 'name': '", "");
  name = name.replaceAll("', 'magnet':.*", "");
  name = name.replaceAll("%20", "\\.");
  name = name.replaceAll("%5B.*", "");
  initialize(ShowData.fromFilename(name));

  seeds = Integer.parseInt(cells.get(4).text());
  peers = Integer.parseInt(cells.get(5).text());

  Element torrentAnchor = firstCell.select("div a[title=Download torrent file]").get(0);
  // Drop the query string, then rewrite the ".torrent" suffix.
  String link =
      torrentAnchor.attr("href").split("\\?")[0].replaceAll("\\.torrent", "/temp\\.torrent");
  // Protocol-relative links get an explicit scheme.
  downloadLink = link.startsWith("//") ? "http:" + link : link;
}
@Override public void run() { // TODO Auto-generated method stub Document doc = null; Elements eles = null; if (!Utils.isNET(NewsContentActivity.this)) { Utils.showToast(NewsContentActivity.this, "网络不可用哦,亲!", Toast.LENGTH_SHORT); } else { try { doc = Jsoup.connect(url).timeout(8000).get(); if (null == doc) { Utils.showToast(NewsContentActivity.this, "网络不给力哦,亲,请返回再进入吧!", Toast.LENGTH_SHORT); return; } eles = doc.select("#Cnt-Main-Article-QQ P"); StringBuilder sb = new StringBuilder(); for (int i = 0; i < eles.size(); i++) { sb.append(eles.get(i).outerHtml()); } Message msg = new Message(); Bundle bundle = new Bundle(); bundle.putString("content", sb.toString()); Log.i("content", sb.toString()); msg.setData(bundle); msg.what = NewsContentActivity.NEWCONTENTRECEIVED; myHandler.sendMessage(msg); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }
/**
 * Fetches the page at {@code string}, takes the {@code href} of the first element with class
 * {@code result-title} and hands it to {@code readHref}.
 *
 * @param string URL of the page to fetch
 * @return whatever {@code readHref} returns for that href
 * @throws Exception if the page cannot be fetched, no such element exists, or {@code readHref}
 *     fails
 */
public static String getAddress(String string) throws Exception {
  Document page = Jsoup.connect(string).get();
  String href = page.getElementsByClass("result-title").get(0).attr("href");
  return readHref(href);
}
public static void readHead() { String url = "http://www.2177s.com"; try { Document doc = Jsoup.connect(url).timeout(10000).get(); String title = doc.title(); System.out.printf("title:%s\n", title); // Elements eles = doc.select("meta[name~=(?i)keywords|(?i)description]"); Elements eles = doc.select("meta"); System.out.println(eles.size()); for (Element ele : eles) { if (StringUtils.containsIgnoreCase(url, title)) ; if (ele.toString().matches(".*(?i)keywords.*")) { System.out.println(ele.attr("content")); } // System.out.println(ele.attr("content")); } // Elements eles = doc.getElementsByTag("meta"); // for (Element ele : eles) { // System.out.printf("keys:%s\n", ele.attr("keywords")); // System.out.printf("desc:%s\n", ele.attr("description")); // System.out.println("----------------"); // } doc = null; } catch (Exception e) { e.printStackTrace(); } }
/**
 * Converts every {@code <img>} under {@code aInContent} into a node.
 *
 * <p>Images carrying {@code float-left}/{@code float-right} also get {@code alignleft}/
 * {@code alignright}; every image except the first loses its {@code width} and {@code height}
 * attributes. The elements are modified in place and a clone of each converted node is
 * returned.
 */
private static Collection<Node> extractImageNodes(Element aInContent) {
  Collection<Node> lResult = new LinkedList<>();
  boolean lIsFirst = true;

  for (Element lImage : aInContent.getElementsByTag("img")) {
    if (lImage.hasClass("float-left")) {
      if (!lImage.hasClass("alignleft")) {
        lImage.addClass("alignleft");
      }
    } else if (lImage.hasClass("float-right")) {
      if (!lImage.hasClass("alignright")) {
        lImage.addClass("alignright");
      }
    }

    if (!lIsFirst) {
      lImage.removeAttr("width");
      lImage.removeAttr("height");
    }
    lIsFirst = false;

    lResult.add(toNode(lImage).clone());
  }
  return lResult;
}
@Override public Collection<News> crawl() { HashSet<News> news = new HashSet<>(); try { String startURL = Settings.HOMEPAGE; Document doc = Jsoup.connect(startURL).get(); Elements contents = doc.select("article"); // extract all articles out of src long counter = 1; for (Element content : contents) { // getting content for all article Elements articleLink = content.select("a.teaser__link"); Element img = articleLink.select("img").first(); String imageSrc = null; try { // try to clean image src imageSrc = img.attr("data-srcset"); imageSrc = imageSrc.split(",")[0].split(" ")[0]; } catch (Exception e) { } String title = articleLink.select("div.title__catchline").text(); String undertitle = articleLink.select("div.title__name").text(); String link = articleLink.select("[href]").attr("href"); news.add(new News(counter, title, undertitle, link, imageSrc, "DE")); counter++; } } catch (Exception ex) { System.out.println("Website not parsed!!"); return null; } return news; }
public Collection<String> extractSubscribedUser(final String htmlContent) { // logger.debug("htmlContent:\n" + htmlContent); final List<String> result = new ArrayList<String>(); final Document document = Jsoup.parse(htmlContent); final Elements tables = document.getElementsByTag("table"); for (final Element table : tables) { if (isSubscriptTable(table)) { for (final Element tr : table.getElementsByTag("tr")) { final Elements tds = tr.getElementsByTag("td"); if (!tds.isEmpty()) { final String name = tds.get(0).text(); if (name != null) { final String nameTrimed = name; if (nameTrimed.length() > 1) { logger.debug("found subscription for user: '******'"); result.add(nameTrimed); } } } } } } logger.debug("found " + result.size() + " subscribed users in htmlcontent"); return result; }
/** * begin crawling with a specific url use depth first search * * @throws IOException * @throws SQLException */ public void crawl(String starturl) throws IOException, SQLException { if (urlid >= MAXURL) // base case return; Document doc; try { doc = Jsoup.connect(starturl).get(); } catch (IOException e) { // if the url is not valid, stop the crawling process return; } catch (IllegalArgumentException e) { System.out.println("Must supply a valid URL : " + starturl); return; } if (!urlList.contains(starturl)) { urlList.add(starturl); } // if the url has already been crawled else if (urlList.contains(starturl)) { return; } Elements hrefs = doc.select("a"); urlid += 1; // terminate the process if there is no more link in a webpage if (hrefs == null || hrefs.size() == 0) return; HashMap<String, Integer> wordMap = parseHTML(getHTMLContent(starturl)); insertDBWord(starturl, wordMap, urlid); insertDBDescription(starturl, topOneHundred(starturl), urlid); for (Element e : hrefs) { String href = e.attr("href"); crawl(href); // depth first search; } }
public static void getComic(String arg) { Document doc; try { doc = Jsoup.connect(arg).get(); // String title = doc.title(); // System.out.print("Title: " + title); // Select the img tag in the comic id Elements links = doc.select("#comic img"); System.out.print("\nComic Name : " + links.attr("alt")); System.out.print("\nImage Source : " + links.attr("src") + "\n\n"); URL url = new URL(links.attr("src")); RenderedImage comic = ImageIO.read(url); String baseName = links.attr("alt").replaceAll("\\s", "_"); ImageIO.write(comic, "png", new File("/home/paranoidsp/Pictures/xkcd/" + baseName + ".png")); /* * Unfortunately, the transcript isn't formatted, so I get one * large line of text instead of readable dialogue. * TODO: Fix this. Find a way to get it. * Elements transcript = doc.select("#transcript"); System.out.print("Transcript: \n" + transcript.text()); */ } catch (IOException exp) { exp.printStackTrace(); } }
/**
 * Integration flow that polls the Evernote message source at a fixed rate, drops empty
 * collections, splits the rest into single notes, turns each note into text (note body if
 * available, otherwise its title) and forwards non-null results to {@code wordRequestsChannel}.
 */
@Bean
public IntegrationFlow evernoteIntegration() {
  return IntegrationFlows.from(
          this.evernoteMessageSource(),
          configurer ->
              configurer.poller(Pollers.fixedRate(pollIntervalInSeconds, TimeUnit.SECONDS)))
      .channel(this.inputChannel())
      .filter(Collection.class, source -> !source.isEmpty())
      .split()
      .transform(
          Note.class,
          source -> wordsOrTitle(source),
          configurer -> configurer.requiresReply(false))
      .filter(source -> source != null)
      .channel(wordRequestsChannel)
      .get();
}

/**
 * Returns the text of the note's single {@code en-note} element when the content is non-blank,
 * contains exactly one such element and that element has non-blank text; otherwise the title.
 */
private String wordsOrTitle(Note source) {
  String content = source.getContent();
  if (StringUtils.isBlank(content)) {
    return source.getTitle();
  }
  Elements noteElements = Jsoup.parse(content).select("en-note");
  if (noteElements.size() != 1) {
    return source.getTitle();
  }
  String wordsFromNote = noteElements.get(0).text();
  return StringUtils.isNotBlank(wordsFromNote) ? wordsFromNote : source.getTitle();
}
/**
 * Rewrites every {@code src}/{@code href} attribute of the form {@code cid:<id>} to the
 * download URL of the attachment registered under that id. Attributes whose id has no
 * attachment are left untouched.
 *
 * @param html HTML fragment or document to rewrite
 * @param attachments attachments keyed by content id
 * @return the inner HTML of the body when the parsed document has one, otherwise the whole
 *     document's HTML
 */
private static String replaceCidWithAttachments(
    String html, Map<String, Attachment> attachments) {
  final String scheme = "cid:";
  Document doc = Jsoup.parse(html);

  for (String attrName : new String[] {"src", "href"}) {
    for (Element tag : doc.select("*[" + attrName + "]")) {
      String uriString = tag.attr(attrName).trim();
      // Case-insensitive prefix test that does not depend on the default locale
      // (toLowerCase() turns "CID:" into "cıd:" under a Turkish locale).
      if (!uriString.regionMatches(true, 0, scheme, 0, scheme.length())) {
        continue;
      }
      Attachment attachment = attachments.get(uriString.substring(scheme.length()));
      if (attachment == null) {
        continue;
      }
      Long id = attachment.id;
      tag.attr(attrName, controllers.routes.AttachmentApp.getFile(id).url());
    }
  }

  Elements bodies = doc.getElementsByTag("body");
  if (bodies.size() > 0) {
    return bodies.get(0).html();
  }
  return doc.html();
}
/**
 * Scrapes a listing page and its specification page.
 *
 * <p>From the page at {@code url} it records "price", "modelNumber" and "title"; it then loads
 * the specification page derived from the title link and records "upc" from the first
 * specification row mentioning "UPC". "GoodSKU" is "true" when the specification table has at
 * least one row, otherwise "false".
 *
 * @param doc ignored; immediately replaced by the document loaded from {@code url}
 * @param url listing page to load
 * @return the collected values
 */
public HashMap<String, String> initialBestBuyScan(Document doc, String url) {
  doc = jsoupConnect(url);

  HashMap<String, String> matchingItems = new HashMap<String, String>();
  matchingItems.put("price", doc.select(".medium-item-price").text());
  matchingItems.put(
      "modelNumber", doc.select(".list-item-info .sku-model ul .model-number").text());

  Elements titleLink = doc.select(".list-item-info .sku-title h4 a");
  matchingItems.put("title", titleLink.text());

  String newURL = "http://bestbuy.com" + bestBuySpecsFormatter(titleLink.attr("href"));
  System.out.println(newURL);
  doc = jsoupConnect(newURL);

  Elements specRows = doc.select("#full-specifications table tbody tr");
  for (Element row : specRows) {
    String rowText = row.text();
    if (rowText.contains("UPC")) {
      matchingItems.put("upc", rowText.replace("UPC ", ""));
      break;
    }
  }
  matchingItems.put("GoodSKU", specRows.size() < 1 ? "false" : "true");

  doc.empty();
  return matchingItems;
}
/**
 * Reads the name/value rows of {@code table#id_stats} into {@code stat}. Recognised names
 * (matched by prefix) are "Win-Loss-Void", "Stake avg", "Odd avg", "Staked" and "Returned";
 * other rows are ignored.
 *
 * @param doc parsed page containing the statistics table
 * @param stat receives the parsed values
 */
private static void parseStatHeaderDetails(Document doc, Statistic stat) {
  for (Element tr : doc.select("table#id_stats").select("tr")) {
    Elements tds = tr.select("td");
    if (tds.size() < 2) {
      // Header or spacer rows have no name/value pair.
      continue;
    }
    String name = tds.get(0).text().trim();
    String value = tds.get(1).text().trim();

    if (name.startsWith("Win-Loss-Void")) {
      String[] values = value.split("-");
      if (values.length == 3) {
        stat.setWin(NumberParser.parseInt(values[0]));
        stat.setLose(NumberParser.parseInt(values[1]));
        stat.setVoid_(NumberParser.parseInt(values[2]));
      } else {
        logger.warn("Win-Loss-Void section doesn't contain 3 elements as expected");
      }
    } else if (name.startsWith("Stake avg")) {
      stat.setAvgStake(NumberParser.parseDouble(value));
    } else if (name.startsWith("Odd avg")) {
      stat.setAvgOdds(NumberParser.parseDouble(value));
    } else if (name.startsWith("Staked")) {
      stat.setStaked(NumberParser.parseDouble(value));
    } else if (name.startsWith("Returned")) {
      stat.setReturned(NumberParser.parseDouble(value));
    }
  }
}
/**
 * Emits one "productPage" entity per {@code .InfoArea a[title]} link in the downloaded
 * document, using the link's absolute {@code href}. When the document has no such link, the
 * source page's own URL is emitted instead. A missing document yields an empty observable.
 */
private Observable<WebPageEntity> parseDocument(DownloadResult downloadResult) {
  Set<WebPageEntity> pages = new HashSet<>(1);
  Document document = downloadResult.getDocument();
  if (document == null) {
    return Observable.from(pages);
  }

  Elements productLinks = document.select(".InfoArea a[title]");
  if (productLinks.isEmpty()) {
    // No product links: treat the source page itself as the product page.
    WebPageEntity self =
        new WebPageEntity(
            downloadResult.getSourcePage(),
            "",
            "productPage",
            downloadResult.getSourcePage().getUrl(),
            downloadResult.getSourcePage().getCategory());
    LOGGER.info("productPageUrl={}", self.getUrl());
    pages.add(self);
    return Observable.from(pages);
  }

  for (Element productLink : productLinks) {
    WebPageEntity linked =
        new WebPageEntity(
            downloadResult.getSourcePage(),
            "",
            "productPage",
            productLink.attr("abs:href"),
            downloadResult.getSourcePage().getCategory());
    LOGGER.info("productPageUrl={}", linked.getUrl());
    pages.add(linked);
  }
  return Observable.from(pages);
}
/**
 * Crawls listing pages one after another, starting at the current {@code page} counter, until
 * a page contains no {@code .video-item} element. Every item is stored as a {@code Movie}
 * (created if no movie with that name exists); a failure on one item is logged and does not
 * stop the crawl.
 */
private static void crawl() {
  // Iterate instead of recursing once per page so a long listing cannot exhaust the stack.
  while (true) {
    String url = url_tpl + (page++);
    Logger.info("正在抓取:%s", url);
    if (StringUtils.isBlank(url)) {
      return;
    }
    sleep();

    Document doc = Jsoup.parse(WS.url(url).get().body, url);
    Elements elements = doc.select(".video-item");
    if (elements.isEmpty()) {
      return;
    }

    for (Element element : elements) {
      try {
        Element link = element.select(">a").first();
        String cover = link.select("img").first().absUrl("src");
        String coverTitle = link.select(".v-update").first().html();
        String detailUrl = link.absUrl("href");
        String name = element.select(".v-desc .v-title a").first().html();
        Logger.info("正在抓取名称:%s", name);

        Movie movie = Movie.find("byName", name).first();
        if (movie == null) {
          movie = new Movie();
          movie.id = DBCounter.generateUniqueCounter(Movie.class) + "";
        }
        movie.name = name;
        movie.cover = cover;
        movie.cover_title = coverTitle;
        movie.details =
            getDetails(
                movie,
                "http://video.baidu.com/v?word=" + URLEncoder.encode("美剧 " + name, "GBK"));
        movie.save();
      } catch (Exception e) {
        Logger.error(e.getMessage(), e);
      }
    }
  }
}
/**
 * Loads "http://enib.net/" and appends one {@code News} per element with class {@code news} to
 * the {@code news} field: name from {@code h1}, information from {@code h2}, and the paragraphs
 * of the {@code markdown} blocks joined with the platform line separator.
 *
 * @return the {@code news} field; unchanged when the page cannot be loaded.
 *     NOTE(review): each call appends to the same list, so repeated calls accumulate entries —
 *     confirm this is intended.
 */
public List<News> getNews() {
  Document doc;
  try {
    doc = Jsoup.connect("http://enib.net/").get();
  } catch (IOException e) {
    System.out.println("Can't load news");
    e.printStackTrace();
    // Without a document there is nothing to parse; previously this fell through to an NPE.
    return this.news;
  }

  String lineSeparator = System.getProperty("line.separator");
  for (Element item : doc.getElementsByClass("news")) {
    String name = item.select("h1").text();
    String information = item.select("h2").text();

    StringBuilder body = new StringBuilder();
    for (Element paragraph : item.getElementsByClass("markdown").select("p")) {
      body.append(paragraph.text()).append(lineSeparator);
    }

    this.news.add(new News(name, information, body.toString()));
  }
  return this.news;
}
@Override public Article run(HtmlObject htmlObject) { String html = htmlObject.getHtml(); Document doc = Jsoup.parse(html); String title = doc.select(".article h1").text(); Elements contentElement = doc.select(".article_con"); String content = ""; String contentHtml = ""; if (contentElement != null) { // contentElement.select(".author").remove(); content = contentElement.text(); contentHtml = contentElement.html(); } String Ele_data = doc.select(".article h2").text(); Matcher m1 = datePattern.matcher(Ele_data); String date = ""; if (m1.find()) { date = m1.group(1); } else { Date today = new Date(); SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd"); date = formatter.format(today); } Article model1 = new Article(); model1.setUrl(htmlObject.getUrl()); model1.setTitle(title); model1.setContent(content); model1.setPublishDate(date); model1.setArticleType(ArticleType.News); model1.setProvider("雨果网"); return model1; }
public static boolean getFormFields( ResponseWrapper rw, List<NameValuePairString> hiddenFormFields, String formSelector) { // --- analisi della pagina contente la form, specifica al sito Document doc = rw.getJSoupDocument(); Elements els = doc.select(formSelector); // per debug, dovrebbe essere uo if (els == null || els.size() <= 0) { log.error("unable to find form at selector: " + formSelector); System.exit(1); return false; } Element loginForm = els.get(0); if (loginForm == null) { log.error("failed to get form to analyze at: " + rw.dump()); System.exit(1); } // log.info("login form OUTER HTML\n" + loginForm.outerHtml()); Elements inputFields = loginForm.select("input"); // display all for (Element e : inputFields) { String type = e.attr("type"); if (type.equals("submit")) { continue; } String attrName = e.attr("name"); hiddenFormFields.add(new NameValuePairString(attrName, e.val())); log.debug("captured form input: " + attrName + " = " + e.val()); } return false; }
@Override public List<String> parseCategory(String categoryName, String categoryURL) { // TODO Auto-generated method stub List<String> linksByCategoryList = null; try { Document doc = Jsoup.connect(categoryURL).timeout(Constants.MAX_DELAY_TIME * 1000).get(); Elements links = doc.select("div[class=views-field views-field-title]").select("a"); if (links != null && links.size() > 0) { linksByCategoryList = new ArrayList<String>(); for (Element element : links) { String newsLink = element.attr("href"); newsLink = newsLink.substring(1); linksByCategoryList.add(newsLink); } } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return linksByCategoryList; }
/**
 * Queries the Correios CEP locator at {@code url} and parses the address out of the returned
 * HTML.
 *
 * <p>The URL is first POSTed; when that answers 200 the page is read from the URL and the first
 * four childless {@code td} cells without a {@code colspan} are used as the address parts.
 *
 * @param url locator URL including the CEP query
 * @return a result with {@code SUCCESS_CODE} and the address parts, or a result with
 *     {@code ERROR_CODE} when the status is not 200 or the page cannot be read or parsed
 * @author pulu - 09/09/2013
 */
private Webservicecep findAddressByCepAtCorreios(String url) throws IOException {
  HttpClient httpClient = new HttpClient();
  PostMethod postMethod = new PostMethod(url);
  log.log(Level.INFO, "Querying to correios WS...");
  try {
    httpClient.executeMethod(postMethod);
    if (postMethod.getStatusCode() != HttpStatus.SC_OK) {
      return new Webservicecep(Webservicecep.ERROR_CODE);
    }

    // Close the stream once parsed; it was previously left open.
    try (java.io.InputStream page = new URL(url).openStream()) {
      Document doc = Jsoup.parse(page, "ISO-8859-1", url);
      Elements elements = doc.select("td:not([colspan]):not(:has(*))");
      return new Webservicecep(
          Webservicecep.SUCCESS_CODE,
          elements.get(3).ownText(),
          elements.get(2).ownText(),
          elements.get(1).ownText(),
          "",
          elements.get(0).ownText());
    }
  } catch (Exception e) {
    log.log(Level.WARNING, "Failed to parse html data. Possible reason: invalid cep.", e);
    return new Webservicecep(Webservicecep.ERROR_CODE);
  } finally {
    // Hand the connection back to the connection manager.
    postMethod.releaseConnection();
  }
}
/**
 * Loads the draws from a locally saved results page.
 *
 * <p>For every table row without header cells, column 0 is the draw code, columns 2-7 are the
 * six drawn numbers and column 15 holds "SIM" when the prize accumulated. Rows with fewer than
 * 16 cells are skipped.
 *
 * @throws IOException if the input file cannot be read
 */
private BancoMegaSena() throws IOException {
  this.concursos = new ArrayList<Concurso>();
  File input = new File("C:\\Users\\Rodrigo Lacerda\\Downloads\\D_mgsasc (1)\\d_megasc.htm");
  Document doc = Jsoup.parse(input, "UTF-8");
  Elements trs = doc.getElementsByTag("tr");
  if (trs.size() > 1) {
    System.out.println(trs.get(1).getElementsByTag("th"));
  }

  for (Element tr : trs) {
    if (!tr.getElementsByTag("th").isEmpty()) {
      continue;
    }
    Elements tds = tr.getElementsByTag("td");
    if (tds.size() < 16) {
      // Not a full draw row; reading columns 0..15 would go out of bounds.
      continue;
    }

    Concurso concurso = new Concurso(Integer.parseInt(tds.get(0).text()));
    for (int column = 2; column <= 7; column++) {
      concurso.addNumero(Integer.parseInt(tds.get(column).text()));
    }
    concurso.setAcumulado(tds.get(15).text().equals("SIM"));
    this.concursos.add(concurso);
  }
}
@Override protected RemoteDetectionResult detectRemoteRepository( final ScrapeContext context, final Page page) { // cheap checks first, to quickly eliminate target without doing any remote requests if (page.getHttpResponse().getStatusLine().getStatusCode() == 200) { final Elements elements = page.getDocument().getElementsByTag("a"); if (!elements.isEmpty()) { // get "template" parent link final Element templateParentLink = getParentDirectoryElement(page); // get the page parent link (note: usually it's 1st elem, but HTTPD for example has extra // links for // column // sorting for (Element element : elements) { // if text is same and abs URLs points to same place, we got it if (templateParentLink.text().equals(element.text()) && templateParentLink.absUrl("href").equals(element.absUrl("href"))) { return new RemoteDetectionResult( RemoteDetectionOutcome.RECOGNIZED_SHOULD_BE_SCRAPED, getTargetedServer(), "Remote is a generated index page of " + getTargetedServer()); } } } } // um, we were not totally positive, this might be some web server with index page similar to // Nexus one return new RemoteDetectionResult( RemoteDetectionOutcome.UNRECOGNIZED, getTargetedServer(), "Remote is not a generated index page of " + getTargetedServer()); }
/**
 * Downloads menu page {@code number} and extracts one meal per {@code td[width=400]} cell: the
 * description is the first {@code div[id=ssilka]} of the cell's parent and the cost is the
 * number before the first "-" in the last such div.
 *
 * @param number value formatted into the {@code URL} template
 * @return the meals found; empty when the page cannot be downloaded. Cells whose row lacks the
 *     expected divs or a parsable cost are skipped.
 */
public List<MenuMeal> getMenuMeals(int number) {
  List<MenuMeal> meals = new ArrayList<>();
  Document doc;
  try {
    doc =
        Jsoup.connect(String.format(URL, number))
            .userAgent("Chrome/49.0.2623.112")
            .referrer("https://www.google.ru/")
            .timeout(7000)
            .get();
  } catch (IOException e) {
    e.printStackTrace();
    return meals;
  }
  if (doc == null) {
    return meals;
  }

  // The selector was missing its closing bracket.
  for (Element cell : doc.select("td[width=400]")) {
    Elements parts = cell.parent().select("div[id=ssilka]");
    if (parts.isEmpty()) {
      continue;
    }
    String cost = parts.last().text();
    int dash = cost.indexOf("-");
    if (dash < 0) {
      continue;
    }
    Integer price;
    try {
      price = Integer.valueOf(cost.substring(0, dash).trim());
    } catch (NumberFormatException ignored) {
      // Not a numeric price; skip this entry rather than abort the whole menu.
      continue;
    }

    MenuMeal menuMeal = new MenuMeal();
    menuMeal.setDescription(parts.first().text());
    menuMeal.setCost(price);
    meals.add(menuMeal);
  }
  return meals;
}
private Integer searchResults(Document document) { Integer occurences = 0; String searchResult = "0"; Elements searchResults = document.select("h2.page-title.hidden-xs"); if (searchResults.size() == 0) { // Sometimes results come in a different place, check it searchResults = document.select("div#resultsCountHeader h1.fnt12"); } if (searchResults.size() > 0) { searchResult = searchResults.get(0).text().split(" ")[0]; } // When the result is more than 1000 we get 1000+, so we delete the + sign if (searchResult.endsWith("+")) { searchResult = searchResult.substring(0, searchResult.length() - 1); } try { // We deal with results like 'Zero' or 'Sorry, none job...' occurences = Integer.parseInt(searchResult.replace(",", "")); } catch (NumberFormatException e) { System.out.println("Error parsing:" + searchResult); occurences = 0; } return occurences; }