private List<ArtifactVersionBean> parseMavenMetadata(Document doc) { String groupId = doc.getElementsByTag("groupId").text(); String artifactId = doc.getElementsByTag("artifactId").text(); if (!StringUtils.hasText(groupId) || !StringUtils.hasText(artifactId)) { return Lists.newArrayListWithCapacity(0); } Elements versions = doc.getElementsByTag("version"); List<ArtifactVersionBean> artifactList = Lists.newArrayList(); for (Element version : versions) { ArtifactVersionBean artifactVersionBean = new ArtifactVersionBean(); artifactVersionBean.setGroupId(groupId); artifactVersionBean.setArtifactId(artifactId); artifactVersionBean.setVersion(version.text()); artifactVersionBean.setId(groupId + ":" + artifactId + ":" + version.text()); // Gets and convert the last update date Long lastUpdateDate = retrieveLastUpdateDate(artifactVersionBean); if (lastUpdateDate == null) { continue; } artifactVersionBean.setTimestamp(lastUpdateDate); artifactList.add(artifactVersionBean); } return artifactList; }
/**
 * Parses the given HTML and strips a fixed set of unwanted elements from it (share box,
 * elements whose id starts with {@code BAIDU}, iframes whose id starts with {@code 360_HOT},
 * {@code div.n_article} and the comment box).
 *
 * @param pcont the HTML content; may be {@code null}
 * @return the serialized document after removal, or an empty string when {@code pcont} is null
 */
public String reviseContForLieyunwang(String pcont) {
  if (pcont == null) {
    return "";
  }
  Document parsed = Jsoup.parse(pcont);

  // Selectors are applied one after another, in this order.
  String[] unwantedSelectors = {
    "div#share-box", "div[id^=BAIDU]", "iframe[id^=360_HOT]", "div.n_article", "div#comment-box"
  };
  for (String selector : unwantedSelectors) {
    for (Element unwanted : parsed.select(selector)) {
      unwanted.remove();
    }
  }
  return parsed.html();
}
/**
 * Renders the most recent log entries into the {@code logs.html} classpath template.
 *
 * <p>One table row is appended to the element with id {@code logs} for every entry returned by
 * {@code Logger.getInstance().getLastLogs()}. The template is read through
 * {@code getResourceAsStream} so that it is also found when the resource is packaged inside a
 * jar or its path contains characters that are escaped in a URL.
 *
 * @return the rendered HTML, or {@code null} when the template could not be loaded or parsed
 */
@GET
@Path("/logs")
@Produces({MediaType.TEXT_HTML})
public String logs() {
  Document doc = null;
  try (java.io.InputStream template =
      getClass().getClassLoader().getResourceAsStream("logs.html")) {
    if (template == null) {
      throw new java.io.FileNotFoundException("logs.html not found on the classpath");
    }
    ArrayList<Logger.Log> logs = Logger.getInstance().getLastLogs();
    doc = Jsoup.parse(template, "UTF-8", "");
    Element tbody = doc.getElementById("logs");
    if (tbody != null) {
      for (Logger.Log log : logs) {
        Element tr = tbody.appendElement("tr").addClass(log.getType_log());
        tr.appendElement("td").addClass("type").text(log.getType_log());
        tr.appendElement("td").addClass("date").text(log.getDate().toString());
        tr.appendElement("td").addClass("message").text(log.getMessage());
      }
    }
  } catch (Exception e) {
    // Best effort: whatever part of the document was built so far is still returned below.
    e.printStackTrace();
  }
  if (doc != null) {
    return doc.html();
  }
  return null;
}
public Worker(String url, boolean verbose) throws Exception { Document doc; doc = Jsoup.connect(url).get(); // select anchors with href only Elements links = doc.select("a[href]"); String l_Href; String host; int linksNum; Parser parser; for (Element link : links) { // absolute = http:// added l_Href = link.attr("abs:href"); if (!l_Href.isEmpty()) { parser = new Parser(l_Href); host = parser.getHost(); // if tempStats contains the url, add one to the value if (tempStats.containsKey(host)) { linksNum = tempStats.get(host); tempStats.put(host, linksNum += 1); } // if it doesn't, add it else { tempStats.put(host, 1); } // parse the url tempQueue.add(parser.getURL()); } } if (verbose) { System.out.println( Thread.currentThread().getName() + " : " + tempQueue.size() + " links from " + url); } }
/**
 * Imports bookmarks from an uploaded HTML file for the currently logged-in user.
 *
 * <p>Every {@code a} element carrying an {@code add_date} attribute becomes a
 * {@link BookmarkBean}; the attribute value is multiplied by 1000 before being turned into a
 * {@link Date}. A failure while saving a single bookmark is ignored so the remaining ones are
 * still processed. Any other failure is rethrown through {@code DataObjectException} with the
 * message "没有权限" ("no permission").
 *
 * @param compParameter request context used to resolve the user and the request encoding
 * @param multipartFile the uploaded file
 * @param json not used by this method
 */
@Override
public void upload(
    ComponentParameter compParameter, IMultipartFile multipartFile, HashMap<String, Object> json) {
  try {
    ID userId = ItSiteUtil.getLoginUser(compParameter).getId();
    if (userId == null) {
      return;
    }
    final Document bookmarksPage =
        Jsoup.parse(
            multipartFile.getInputStream(), compParameter.request.getCharacterEncoding(), "");
    for (final Element anchor : bookmarksPage.getElementsByTag("a")) {
      if (!anchor.hasAttr("add_date")) {
        continue;
      }
      final BookmarkBean bookmark = new BookmarkBean();
      final long addedMillis = ConvertUtils.toLong(anchor.attr("add_date"), 0) * 1000;
      bookmark.setTitle(anchor.text());
      bookmark.setUrl(anchor.attr("href"));
      bookmark.setUserId(userId);
      bookmark.setUpdateDate(new Date(addedMillis));
      try {
        BookmarkUtils.applicationModule.doUpdate(bookmark);
      } catch (Exception ignored) {
        // A bookmark that cannot be stored is skipped on purpose.
      }
    }
  } catch (final Exception e) {
    throw DataObjectException.wrapException("没有权限");
  }
}
@Override public List<String> parseCategory(String categoryName, String categoryURL) { // TODO Auto-generated method stub List<String> linksByCategoryList = null; try { Document doc = Jsoup.connect(categoryURL).timeout(Constants.MAX_DELAY_TIME * 1000).get(); Elements links = doc.select("div[class=views-field views-field-title]").select("a"); if (links != null && links.size() > 0) { linksByCategoryList = new ArrayList<String>(); for (Element element : links) { String newsLink = element.attr("href"); newsLink = newsLink.substring(1); linksByCategoryList.add(newsLink); } } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return linksByCategoryList; }
/**
 * Downloads the jkanime.net front page on a separate thread and returns one {@link EntryModel}
 * per element with class {@code home_portada_bg}.
 *
 * <p>The calling thread blocks until the download thread has finished. Elements lacking an
 * anchor or an image are skipped instead of aborting the whole scan with a
 * {@code NullPointerException}.
 *
 * @return the collected entries (possibly empty when the download failed), or {@code null} when
 *     the calling thread was interrupted while waiting; the interrupt flag is restored in that
 *     case
 */
public static ArrayList<EntryModel> getPopularContent() {
  final ArrayList<EntryModel> result = new ArrayList<>();
  Thread thread =
      new Thread(
          () -> {
            try {
              Document document = Jsoup.connect("http://jkanime.net/").get();
              for (Element element : document.getElementsByClass("home_portada_bg")) {
                Element anchor = element.getElementsByTag("a").first();
                Element image = element.getElementsByTag("img").first();
                if (anchor == null || image == null) {
                  continue;
                }
                result.add(
                    new EntryModel(
                        Constants.TYPE_SHOW,
                        anchor.text(),
                        anchor.attr("abs:href"),
                        image.attr("src")));
              }
            } catch (IOException e) {
              e.printStackTrace();
            }
          });
  thread.start();
  try {
    // join() makes the worker's writes to 'result' visible to this thread.
    thread.join();
    return result;
  } catch (InterruptedException e) {
    Thread.currentThread().interrupt();
    e.printStackTrace();
    return null;
  }
}
@Override public SearchResult[] getSearchResults(String searchString) throws IOException { Document doc = Jsoup.connect(searchString).timeout(CONNECTION_TIMEOUT_VALUE).get(); boolean onSearchResultsPage = doc.location().contains("adultSearch.htm"); // found the movie without a search results page if (doc.location() != null && !onSearchResultsPage) { String idOfPage = getIDStringFromDocumentLocation(doc); String posterPath = getPosterPreviewPathFromIDString(idOfPage); String label = doc.select("title").first().text(); Thumb previewImage = new Thumb(posterPath); // SearchResult directResult = new SearchResult(doc.location()); SearchResult result = null; if (posterPath != null) result = new SearchResult(doc.location(), label, previewImage); else result = new SearchResult(doc.location(), label, null); SearchResult[] directResultArray = {result}; return directResultArray; } Elements foundMovies = doc.select("table[width=690]:contains(Wish List) tr tbody:has(img)"); LinkedList<SearchResult> searchList = new LinkedList<SearchResult>(); for (Element movie : foundMovies) { String urlPath = movie.select("a").first().attr("href"); String thumb = movie.select("img").first().attr("src"); String label = movie.select("img").first().attr("alt"); SearchResult searchResult = new SearchResult(urlPath, label, new Thumb(thumb)); if (!searchList.contains(searchResult)) searchList.add(searchResult); } return searchList.toArray(new SearchResult[searchList.size()]); }
/**
 * Re-labels every node below (and including) {@code element} with a marker class.
 *
 * <p>A first traversal removes whatever {@code nodeMarker} matches from each node's
 * {@code class} attribute; a second traversal appends {@code NODE_MARKER + "0_" + n} to it,
 * where {@code n} is the 1-based visiting order of the node.
 *
 * @param element the root of the subtree to mark; modified in place
 * @return the same {@code element} instance
 */
public static Element markTestElement(Element element) {
  // Pass 1: strip existing markers.
  element.traverse(
      new NodeVisitor() {
        @Override
        public void head(Node current, int depth) {
          String stripped = nodeMarker.matcher(current.attr("class")).replaceAll("");
          current.attr("class", stripped);
        }

        @Override
        public void tail(Node current, int depth) {}
      });

  // Pass 2: append a fresh, numbered marker.
  element.traverse(
      new NodeVisitor() {
        int visited = 0;

        @Override
        public void head(Node current, int depth) {
          visited++;
          String marked = current.attr("class") + " " + NODE_MARKER + "0_" + visited + " ";
          current.attr("class", marked);
        }

        @Override
        public void tail(Node current, int depth) {}
      });
  return element;
}
/** * getMovieActors parses through the movie's page html and returns three actors. * * @author defq0n * @param pageLink is the extended imdb url for the movie page. * @return movieActors String containing three actors. */ private static String[] getMovieActors(String pageLink) { String[] movieActors = {"", "", ""}; try { Document d = Jsoup.connect("http://imdb.com" + pageLink).get(); Element e = d.body(); String html = e.toString(); String actorsDiv = ""; for (int i = html.indexOf("<h4 class=\"inline\">Stars:</h4>") + 30; i < html.indexOf("See full cast and crew"); i++) { actorsDiv += html.charAt(i); } String tempDiv = actorsDiv; for (int i = 0; i < 3; i++) { // we will get the first three top actors String actor = ""; String t = "itemprop=\"url\"><span class=\"itemprop\" itemprop=\"name\">"; for (int j = tempDiv.indexOf(t) + t.length(); j < tempDiv.indexOf("</span></a>"); j++) { actor += tempDiv.charAt(j); } movieActors[i] = actor; tempDiv = ""; for (int j = actorsDiv.indexOf(actor + "</span>") + actor.length() + 7; j < actorsDiv.length(); j++) { tempDiv += actorsDiv.charAt(j); } } } catch (Exception e) { System.out.println(e.toString()); } return movieActors; }
/**
 * Looks up a YouTube link for the given movie.
 *
 * <p>For movies whose year is before 1990 the literal string {@code "null"} is returned.
 * Otherwise the YouTube page built from the formatted movie name is fetched and the href of the
 * first {@code yt-lockup-title} link is appended to {@code http://www.youtube.com}. When the
 * page cannot be fetched or the expected markup is missing, the bare
 * {@code http://www.youtube.com} prefix is returned.
 *
 * @param movie the movie to look up
 * @return the trailer link, the bare YouTube prefix, or the string {@code "null"}
 */
private static String getTrailer(Movie movie) {
  if (Integer.valueOf(movie.getMovieYear()) < 1990) {
    return "null";
  }
  String trailerLink = "http://www.youtube.com";
  String link = formatYoutubeString(movie.getMovieName());
  try {
    Document d = Jsoup.connect("http://www.youtube.com/" + link).get();
    String html = d.body().toString();

    final String titleMarker = "class=\"yt-lockup-title \"><a href=\"";
    int titleAt = html.indexOf(titleMarker);
    if (titleAt < 0) {
      return trailerLink;
    }
    // Only the 100 characters following the marker are inspected.
    String linkDiv = html.substring(titleAt, Math.min(titleAt + 100, html.length()));

    int hrefStart = linkDiv.indexOf("<a href=\"") + 9;
    // The href value ends two characters (closing quote and space) before the next attribute.
    int hrefEnd = linkDiv.indexOf("class=\"yt-uix-sessionlink") - 2;
    if (hrefEnd > hrefStart) {
      trailerLink += linkDiv.substring(hrefStart, hrefEnd);
    }
  } catch (Exception e) {
    System.out.println(e.toString());
  }
  return trailerLink;
}
/**
 * Removes every descendant of the top node that carries a {@code gravityScore} attribute and
 * either has a negative score or whose text is shorter than {@code minParagraphText}.
 *
 * @param topNode the node whose scored elements are inspected; modified in place
 */
protected void removeNodesWithNegativeScores(Element topNode) {
  for (Element scored : topNode.select("*[gravityScore]")) {
    int gravity = Integer.parseInt(scored.attr("gravityScore"));
    if (gravity < 0) {
      scored.remove();
      continue;
    }
    if (scored.text().length() < minParagraphText) {
      scored.remove();
    }
  }
}
/**
 * Defines the flow that polls the Evernote message source and forwards text to
 * {@code wordRequestsChannel}.
 *
 * <p>Non-empty collections coming from the source are split into individual notes. Each note is
 * transformed into the text of its single {@code en-note} element; when the content is blank,
 * does not contain exactly one {@code en-note} element, or that element has no text, the note's
 * title is used instead. {@code null} payloads are filtered out before the output channel.
 *
 * @return the assembled integration flow
 */
@Bean
public IntegrationFlow evernoteIntegration() {
  return IntegrationFlows.from(
          this.evernoteMessageSource(),
          sourceSpec ->
              sourceSpec.poller(Pollers.fixedRate(pollIntervalInSeconds, TimeUnit.SECONDS)))
      .channel(this.inputChannel())
      .filter(Collection.class, notes -> !notes.isEmpty())
      .split()
      .transform(
          Note.class,
          note -> {
            String body = note.getContent();
            if (!StringUtils.isNotBlank(body)) {
              return note.getTitle();
            }
            Elements noteRoots = Jsoup.parse(body).select("en-note");
            if (noteRoots.size() != 1) {
              return note.getTitle();
            }
            String words = noteRoots.get(0).text();
            return StringUtils.isNotBlank(words) ? words : note.getTitle();
          },
          endpointSpec -> endpointSpec.requiresReply(false))
      .filter(payload -> payload != null)
      .channel(wordRequestsChannel)
      .get();
}
public static void initMajorList(String originalUrl) { System.out.println("preparing majorList"); boolean finish = false; do { try { majorList.clear(); Connection conn = Jsoup.connect(originalUrl); Document doc = conn.timeout(10000).get(); Elements es = doc.select("#accordion__target-3 > div.course-listing__box > a"); for (Element e : es) { // major MajorForCollection major = new MajorForCollection(); major.setLevel(LEVEL); major.setTitle(e.select("h3").get(0).text().trim()); major.setType(e.select("p").get(0).text().replaceAll("-[\\s\\S]*", "").trim()); major.setUrl(e.select("a").get(0).attr("href")); majorList.add(major); } ; finish = true; } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } while (!finish); System.out.println("majorList prepared"); System.out.println("majorList size: " + majorList.size()); }
/**
 * Parses the HTML and returns the text of its (last) {@code title} element.
 *
 * <p>When a title contains both "moved" and "permanently" (case-insensitive), {@code
 * getMovedUrl} is invoked for each subsequently visited {@code body} element and once more for
 * the whole document; the values it returns are not used here.
 * NOTE(review): discarding those return values looks unintentional - confirm with the caller.
 *
 * @param html the HTML to inspect
 * @return the title text, or an empty string when there is no title element
 */
private static String makeModular(String html) {
  String text = "";
  boolean moved = false;
  Document doc = Jsoup.parse(html);
  for (Element el : doc.getAllElements()) {
    String tag = el.nodeName();
    if ("title".equals(tag)) {
      text = el.text();
      String lowered = text.toLowerCase();
      if (lowered.contains("moved") && lowered.contains("permanently")) {
        moved = true;
      }
    } else if ("body".equals(tag) && moved) {
      getMovedUrl(el);
    }
  }
  if (moved) {
    getMovedUrl(doc);
  }
  return text;
}
/**
 * Rewrites {@code cid:} references in {@code src} and {@code href} attributes so that they point
 * to the URL of the matching attachment.
 *
 * <p>The scheme is matched case-insensitively and independently of the default locale (a plain
 * {@code toLowerCase()} does not turn "CID:" into "cid:" under e.g. a Turkish locale).
 * References whose content id has no entry in {@code attachments} are left untouched.
 *
 * @param html the HTML to process
 * @param attachments attachments keyed by content id
 * @return the inner HTML of the body element, or the whole document's HTML when there is no
 *     body element
 */
private static String replaceCidWithAttachments(
    String html, Map<String, Attachment> attachments) {
  final String cidScheme = "cid:";
  Document doc = Jsoup.parse(html);
  String[] attrNames = {"src", "href"};
  for (String attrName : attrNames) {
    Elements tags = doc.select("*[" + attrName + "]");
    for (Element tag : tags) {
      String uriString = tag.attr(attrName).trim();
      if (!uriString.regionMatches(true, 0, cidScheme, 0, cidScheme.length())) {
        continue;
      }
      // Single lookup; also skips entries that map to null.
      Attachment attachment = attachments.get(uriString.substring(cidScheme.length()));
      if (attachment == null) {
        continue;
      }
      Long id = attachment.id;
      tag.attr(attrName, controllers.routes.AttachmentApp.getFile(id).url());
    }
  }
  Elements bodies = doc.getElementsByTag("body");
  if (bodies.size() > 0) {
    return bodies.get(0).html();
  } else {
    return doc.html();
  }
}
//// COMPLETAMENTE INUTILE public static int[] getPrice(String path) { int[] month = new int[31]; int count = 0; try { File input = new File(path); Document doc = Jsoup.parse(input, "UTF-8", "http://example.com/"); Elements elementi_div = doc.getElementsByTag("div"); for (Element e : elementi_div) { if (e.text().length() > 0) if (Character.isDigit(e.text().charAt(0)) && e.text().contains("€ ")) { count++; String[] arr = e.text().split(" "); month[Integer.parseInt(arr[0]) - 1] = Integer.parseInt(arr[2].replace(".", "")); } } } catch (Exception e) { System.out.println(e); } if (count == 0) { System.out.println("Non e' stato scaricato il file"); // getPrice(path); } return month; }
public static void processEpub(String bookPath, String dest) throws FileNotFoundException, IOException { EpubReader reader = new EpubReader(); Book b = reader.readEpub(new FileInputStream(new File(bookPath))); String content = ""; int pagecount = 1; int tempCounter; Count cnt = new Count(0, 0); for (Resource res : b.getContents()) { content = new String(res.getData()); Document doc = Jsoup.parse(content, "UTF-8"); // http-equiv=\"content-type\" content=\"text/html; charset=utf-8\""); Element elem = new Element(Tag.valueOf("meta"), ""); elem.attr("http-equiv", "content-type"); elem.attr("content", "text/html; charset=utf-8"); doc.head().after(elem); System.out.println(doc.head().data()); Element ele = doc.body(); alterElement(ele); Count cTemp = modify(ele, cnt); cnt.setCount(cTemp.getCount()); cnt.setPgCount(cTemp.getPgCount()); doc.body().html(ele.html()); res.setData(doc.html().getBytes()); if (res.getMediaType() == null) res.setMediaType(new MediaType("html", "html")); } EpubWriter wr = new EpubWriter(); wr.write(b, new FileOutputStream(new File(dest))); }
/**
 * Runs a search on jkanime.net on a separate thread and returns one {@link EntryModel} per
 * element with class {@code search}.
 *
 * <p>Spaces in the query are replaced by underscores before it is appended to the search URL.
 * The calling thread blocks until the download thread has finished. Result elements lacking a
 * {@code titl} element or an image are skipped instead of aborting the whole scan with a
 * {@code NullPointerException}.
 *
 * @param query the search text
 * @return the collected entries (possibly empty when the download failed), or {@code null} when
 *     the calling thread was interrupted while waiting; the interrupt flag is restored in that
 *     case
 */
public static ArrayList<EntryModel> getSearchResults(final String query) {
  final ArrayList<EntryModel> result = new ArrayList<>();
  Thread thread =
      new Thread(
          () -> {
            try {
              Document document =
                  Jsoup.connect("http://jkanime.net/buscar/" + query.replace(" ", "_")).get();
              for (Element element : document.getElementsByClass("search")) {
                Element titleLink = element.getElementsByClass("titl").first();
                Element image = element.getElementsByTag("img").first();
                if (titleLink == null || image == null) {
                  continue;
                }
                result.add(
                    new EntryModel(
                        Constants.TYPE_SHOW,
                        titleLink.text(),
                        titleLink.attr("abs:href"),
                        image.attr("src")));
              }
            } catch (IOException e) {
              e.printStackTrace();
            }
          });
  thread.start();
  try {
    // join() makes the worker's writes to 'result' visible to this thread.
    thread.join();
    return result;
  } catch (InterruptedException e) {
    Thread.currentThread().interrupt();
    e.printStackTrace();
    return null;
  }
}
/** * Recursively writes a data source Item and its children to a design. * * @since 7.5.0 * @param design the element into which to insert the item * @param itemId the id of the item to write * @param context the DesignContext instance used in writing * @return */ @Override protected Element writeItem(Element design, Object itemId, DesignContext context) { Element element = design.appendElement("node"); element.attr("text", itemId.toString()); Resource icon = getItemIcon(itemId); if (icon != null) { DesignAttributeHandler.writeAttribute( "icon", element.attributes(), icon, null, Resource.class); } if (isSelected(itemId)) { element.attr("selected", ""); } Collection<?> children = getChildren(itemId); if (children != null) { // Yeah... see #5864 for (Object childItemId : children) { writeItem(element, childItemId, context); } } return element; }
/** * 从网址里面抽取链接 * * @return 链接的集合 */ public static List<String> getUrlsByPage(String str) { List<String> urls = new ArrayList<String>(); try { URL url = new URL(str); int end = 0; String host = url.getHost(); Document doc = Jsoup.parse(url, 30000); Elements links = doc.select("a"); String href = null; for (Element link : links) { href = link.attr("href"); if (href.startsWith(HTTP)) { urls.add(href); } else if (href.startsWith("/")) { urls.add(HTTP + host + href); } else { if (end > 0) { urls.add(str + href); } else { urls.add(str + href); } } } } catch (MalformedURLException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return urls; }
/**
 * Copies the values of the {@code table#id_stats} table into the given statistic.
 *
 * <p>Each row is read as a name cell followed by a value cell. Rows with fewer than two
 * {@code td} cells are skipped. Recognized name prefixes are "Win-Loss-Void", "Stake avg",
 * "Odd avg", "Staked" and "Returned"; other rows are ignored.
 *
 * @param doc the parsed page
 * @param stat the statistic to populate
 */
private static void parseStatHeaderDetails(Document doc, Statistic stat) {
  Elements statsTrs = doc.select("table#id_stats").select("tr");
  for (Element tr : statsTrs) {
    Elements tds = tr.select("td");
    if (tds.size() < 2) {
      // Not a name/value row; tds.get(1) would throw.
      continue;
    }
    String name = tds.get(0).text().trim();
    String value = tds.get(1).text().trim();

    if (name.startsWith("Win-Loss-Void")) {
      String[] values = value.split("-");
      if (values.length == 3) {
        stat.setWin(NumberParser.parseInt(values[0]));
        stat.setLose(NumberParser.parseInt(values[1]));
        stat.setVoid_(NumberParser.parseInt(values[2]));
      } else {
        logger.warn("Win-Loss-Void section doesn't contain 3 elements as expected");
      }
    } else if (name.startsWith("Stake avg")) {
      stat.setAvgStake(NumberParser.parseDouble(value));
    } else if (name.startsWith("Odd avg")) {
      stat.setAvgOdds(NumberParser.parseDouble(value));
    } else if (name.startsWith("Staked")) {
      stat.setStaked(NumberParser.parseDouble(value));
    } else if (name.startsWith("Returned")) {
      stat.setReturned(NumberParser.parseDouble(value));
    }
  }
}
/**
 * Checks, for each of the given elements, whether {@code getElementsByAttribute(attributeName)}
 * finds anything, and records the outcome.
 *
 * <p>An empty selection yields {@code NOT_APPLICABLE}. Otherwise the result starts as
 * {@code PASSED} and is combined, element by element, with the success or failure solution via
 * {@code setTestSolution}; a source-code remark is added for every element.
 *
 * @param elements the elements to inspect
 * @param testSolutionHandler receives the final test solution
 */
private void checkChildElementWithAttributePresence(
    Elements elements, TestSolutionHandler testSolutionHandler) {
  if (elements.isEmpty()) {
    testSolutionHandler.addTestSolution(TestSolution.NOT_APPLICABLE);
    return;
  }

  TestSolution overall = TestSolution.PASSED;
  for (Element el : elements) {
    if (el.getElementsByAttribute(attributeName).isEmpty()) {
      overall = setTestSolution(overall, getFailureSolution());
      addSourceCodeRemark(getFailureSolution(), el, getFailureMsgCode());
      continue;
    }
    overall = setTestSolution(overall, getSuccessSolution());
    addSourceCodeRemark(getSuccessSolution(), el, getSuccessMsgCode());
  }
  testSolutionHandler.addTestSolution(overall);
}
/** * Take links from results and do pagination (max 7 times). * * @param document * @return */ @Override public List<URL> getNextPages(Document document) { List<URL> urls = new ArrayList<>(); // Collect rows with links to comparing offerts links Elements elements = document.select(PRODUCTS_ROW_QUERY + ":not([onclick])"); for (Element element : elements) { String str = element.attr("abs:href"); try { urls.add(Utils.stringToURL(str)); } catch (ConnectionException e) { } } // Pagination final int MAX_PAGE = 7; Element next = document.select("a[href].next").first(); if (next != null) { String nextStr = next.attr("href"); if (!nextStr.contains("page_nr=" + MAX_PAGE)) { try { urls.add(Utils.stringToURL(nextStr)); } catch (ConnectionException e) { } } } logger.debug("Collected " + urls.size() + " urls to visit"); return urls; }
@Test public void designIsSerializedWithCorrectPrefixesAndPackageNames() throws IOException { ByteArrayOutputStream out = serializeDesign(ctx); // Check the mapping from prefixes to package names using the html tree String[] expectedPrefixes = {"my"}; String[] expectedPackageNames = {"com.addon.mypackage"}; int index = 0; Document doc = Jsoup.parse(out.toString("UTF-8")); Element head = doc.head(); for (Node child : head.childNodes()) { if ("meta".equals(child.nodeName())) { String name = child.attributes().get("name"); if ("package-mapping".equals(name)) { String content = child.attributes().get("content"); String[] parts = content.split(":"); assertEquals("Unexpected prefix.", expectedPrefixes[index], parts[0]); assertEquals("Unexpected package name.", expectedPackageNames[index], parts[1]); index++; } } } assertEquals("Unexpected number of prefix - package name pairs.", 1, index); }
/**
 * Initializes this entry from one result row.
 *
 * <p>The release name is derived from the {@code data-sc-params} attribute found in the first
 * cell, seeds and peers are parsed from the fifth and sixth cells, and the download link is
 * built from the "Download torrent file" anchor: the query string is dropped,
 * {@code .torrent} is replaced by {@code /temp.torrent}, and protocol-relative links get an
 * {@code http:} prefix.
 *
 * @param source the element containing the row's {@code td} cells
 */
@Override
protected void initialize(Element source) {
  Elements cells = source.getElementsByTag("td");
  Element firstCell = cells.get(0);

  String rawParams = firstCell.select("[data-sc-params]").get(0).attr("data-sc-params");
  String name =
      rawParams
          .replaceAll("\\{ 'name': '", "")
          .replaceAll("', 'magnet':.*", "")
          .replaceAll("%20", "\\.")
          .replaceAll("%5B.*", "");
  initialize(ShowData.fromFilename(name));

  seeds = Integer.parseInt(cells.get(4).text());
  peers = Integer.parseInt(cells.get(5).text());

  Element torrentAnchor = firstCell.select("div a[title=Download torrent file]").get(0);
  String withoutQuery = torrentAnchor.attr("href").split("\\?")[0];
  String link = withoutQuery.replaceAll("\\.torrent", "/temp\\.torrent");
  if (link.startsWith("//")) {
    link = "http:" + link;
  }
  downloadLink = link;
}
/**
 * Parses the given HTML and strips a fixed set of unwanted elements from it
 * ({@code div.BAIDU_CLB_AD}, {@code ul.p_mtail}, {@code ul.p_props_tail},
 * {@code div.thread_recommend} and {@code div.j_lzl_container}).
 *
 * @param pcont the HTML content; may be {@code null}
 * @return the serialized document after removal, or an empty string when {@code pcont} is null
 */
public String reviseContForTieba(String pcont) {
  if (pcont == null) {
    return "";
  }
  Document parsed = Jsoup.parse(pcont);

  // Selectors are applied one after another, in this order.
  String[] unwantedSelectors = {
    "div.BAIDU_CLB_AD",
    "ul.p_mtail",
    "ul.p_props_tail",
    "div.thread_recommend",
    "div.j_lzl_container"
  };
  for (String selector : unwantedSelectors) {
    for (Element unwanted : parsed.select(selector)) {
      unwanted.remove();
    }
  }
  return parsed.html();
}
/** * Do not paginate. * * @param document * @return */ @Override public List<URL> getNextPages(Document document) { List<URL> urls = new ArrayList<>(); String nextStrUrl = null; // Pagination /*URL res; try { Elements elements = document.getElementsByClass("next"); Element next = elements.first().select("a").first(); nextStrUrl = next.attr("abs:href"); } catch (NullPointerException e) { return null; } try { res = Utils.stringToURL(nextStrUrl); } catch (ConnectionException e) { logger.debug(e.toString()); return null; } urls.add(res);*/ for (Element element : document.select("div#productView > div.productCompare")) { String href = element.select("a[href].buttonRetail").first().attr("abs:href"); try { urls.add(Utils.stringToURL(href)); } catch (ConnectionException e) { } } return urls; }
/**
 * Downloads the menu page for the given number and extracts its meals.
 *
 * <p>For every {@code td[width=400]} cell, the first {@code div[id=ssilka]} of the cell's
 * parent provides the description and the last one the cost; the cost is the integer in front
 * of the first '-' of that text. Rows that do not have this structure, or whose cost is not a
 * number, are skipped.
 *
 * @param number the value formatted into the {@code URL} template
 * @return the meals found; empty when the page could not be fetched or has no matching rows
 */
public List<MenuMeal> getMenuMeals(int number) {
  List<MenuMeal> meals = new ArrayList<>();
  Document doc;
  try {
    doc =
        Jsoup.connect(String.format(URL, number))
            .userAgent("Chrome/49.0.2623.112")
            .referrer("https://www.google.ru/")
            .timeout(7000)
            .get();
  } catch (IOException e) {
    e.printStackTrace();
    return meals;
  }

  // The attribute selector needs its closing bracket to be a well-formed query.
  for (Element element : doc.select("td[width=400]")) {
    Element parent = element.parent();
    if (parent == null) {
      continue;
    }
    Elements cells = parent.select("div[id=ssilka]");
    if (cells.isEmpty()) {
      continue;
    }
    String cost = cells.last().text();
    int dash = cost.indexOf("-");
    if (dash < 0) {
      continue;
    }
    MenuMeal menuMeal = new MenuMeal();
    menuMeal.setDescription(cells.first().text());
    try {
      menuMeal.setCost(Integer.valueOf(cost.substring(0, dash).trim()));
    } catch (NumberFormatException notANumber) {
      continue;
    }
    meals.add(menuMeal);
  }
  return meals;
}
public static void processPage(String URL) throws SQLException, IOException { // check if the given URL is already in database String sql = "select * from Record where URL = '" + URL + "'"; ResultSet rs = db.runSql(sql); if (rs.next()) { } else { // store the URL to database to avoid parsing again sql = "INSERT INTO test.Record " + "(URL) VALUES " + "(?);"; PreparedStatement stmt = db.conn.prepareStatement(sql, Statement.RETURN_GENERATED_KEYS); stmt.setString(1, URL); stmt.execute(); // get useful information Document doc = Jsoup.connect("http://www.mit.edu/").get(); if (doc.text().contains("PhD")) { System.out.println(URL); } // get all links and recursively call the processPage method Elements questions = doc.select("a[href]"); for (Element link : questions) { if (link.attr("href").contains("mit.edu")) processPage(link.attr("abs:href")); } } }