private Element cleanupElement(Element el) { Tag newTag = null; String newText = null; if (el.nodeName().equals("img")) { newTag = Tag.valueOf("x"); newText = el.attr("src"); } if (el.nodeName().equals("em")) { newTag = Tag.valueOf("b"); } if (el.nodeName().equals("a")) { String clazz = el.attr("class"); if (clazz.equals("user")) { newTag = Tag.valueOf("x"); newText = "@" + el.text().trim(); } else if (clazz.startsWith("postimg video")) { newTag = Tag.valueOf("x"); newText = "VIDEO: " + el.attr("href") + " THUMBNAIL: " + el.select("img").attr("src"); } else if (clazz.startsWith("postimg")) { newTag = Tag.valueOf("x"); } else if (clazz.equals("post")) { newTag = Tag.valueOf("x"); } else { newTag = Tag.valueOf("x"); newText = el.attr("href"); } } if (el.nodeName().equals("div")) { newTag = Tag.valueOf("x"); } Element nel; if (newTag == null) { // el = el; nel = new Element(el.tag(), ""); // for(List<Node> children = nel.childNodes(); children.size() > 0; children = // nel.childNodes()) { // children.get(0).remove(); // } } else { nel = new Element(newTag, ""); } if (newText != null) { nel.appendChild(new TextNode(newText, "")); } else { List<Node> children = el.childNodes(); for (Node child : children) { if (child instanceof Element) { nel.appendChild(cleanupElement((Element) child)); } else { nel.appendChild(new TextNode(child.toString(), "")); } } } return nel; }
/**
 * Parses {@code html} and returns the text of the last {@code title} element
 * encountered while walking all elements of the parsed document, or an empty
 * string when there is none.
 *
 * <p>While walking, a title containing both "moved" and "permanently"
 * (compared in lower case) marks the page as moved.
 *
 * @param html the markup to parse
 * @return the text of the last {@code title} element, or {@code ""}
 */
private static String makeModular(String html) {
    Document doc = Jsoup.parse(html);

    String text = "";
    String url = "";
    boolean moved = false;

    for (Element el : doc.getAllElements()) {
        String name = el.nodeName();
        if ("title".equals(name)) {
            text = el.text();
            String lowered = text.toLowerCase();
            if (lowered.contains("moved") && lowered.contains("permanently")) {
                moved = true;
            }
        } else if ("body".equals(name) && moved) {
            // NOTE(review): the value stored here is never read afterwards.
            url = getMovedUrl(el);
        }
    }

    if (moved) {
        // NOTE(review): the result of this call is discarded.
        getMovedUrl(doc);
    }
    return text;
}
private static String kurwaFunction() { try { // Url URL = new Url(); for (String url : urls) { url = cleanUrl(url); // always clean first if (url.contains("t.co") || url.contains("bit.ly")) { URLUnshortener shortener = new URLUnshortener(); URL url2 = shortener.expand(new URL(url)); url = url2.toString(); } Document doc = Jsoup.connect(url).get(); Elements els = doc.getAllElements(); if (els.hasAttr("title") || els.hasAttr("h1")) { for (Element el : els) { boolean href = true; if (el.nodeName().equals("a") && href) { if (el.text().toLowerCase().equals("here")) { Element el1 = el.attr("href", ""); System.out.println(el1); } } switch (el.nodeName()) { case "title": System.out.println(el.text()); if (!el.text().isEmpty()) { break; } case "h1": if (el.text().contains("moved") && el.text().contains("has")) { href = true; } break; default: break; } } } } } catch (HttpStatusException ex) { System.out.println("Exception http " + ex.getStatusCode()); } catch (IOException ex) { Logger.getLogger(test.class.getName()).log(Level.SEVERE, null, ex); } return ""; }
/**
 * Returns the absolute {@code href} of the last {@code a} element found among
 * {@code el} and its descendants.
 *
 * @param el the element whose subtree (including itself) is searched
 * @return the value of {@code absUrl("href")} for the last anchor in document
 *         order, or {@code ""} when the subtree contains no anchor
 */
private static String getMovedUrl(Element el) {
    Elements candidates = el.getAllElements();
    // Scan backwards: the first anchor met this way is the last one in
    // document order, which is the one whose URL is returned.
    for (int i = candidates.size() - 1; i >= 0; i--) {
        Element candidate = candidates.get(i);
        if ("a".equals(candidate.nodeName())) {
            return candidate.absUrl("href");
        }
    }
    return "";
}
public static List<CaoImg> getImgData(String url) throws Exception { String response = HttpUtils.getString(url); Document parse = Jsoup.parse(response); Elements allElements = parse.getAllElements(); List<CaoImg> caoImgs = new ArrayList<CaoImg>(); for (int i = 0; i < allElements.size(); i++) { Element element = allElements.get(i); // <table class="wikitable" // style="width: 22em; position: absolute; top: 0px; left: 0px;"> String nodeName = element.nodeName(); String attrClass = element.attr("class"); if (nodeName.equals("table") && "wikitable".equals(attrClass + "")) { String title = element.getElementsByAttribute("title").get(0).attr("title"); Elements imgElement = element.getElementsByTag("img"); String src = imgElement.attr("src"); Elements styleElements = element.getElementsByAttributeValueContaining("style", "font-size"); String otherName = null; String intro = null; if (styleElements.size() == 1) { intro = styleElements.get(0).text(); } else { otherName = styleElements.get(0).text(); intro = styleElements.get(1).text(); } CaoImg caoImg = new CaoImg(); caoImg.setName(title); caoImg.setImg(src); caoImg.setOtherName(otherName); caoImg.setIntro(intro); caoImgs.add(caoImg); } } return caoImgs; }
/**
 * Collects every address in the given HTML text that may point to an image.
 *
 * <p>The {@code src} of each {@code img} element is always included. The
 * {@code href} of an {@code a} element is included only when, after removing
 * any query string, the text following its last '.' is accepted by
 * {@code FileUtils.isImageExt}; the address is returned without the query
 * string. Blank attribute values are skipped.
 *
 * @param html the HTML text to scan
 * @return the matching addresses in document order
 */
public static List<String> getImagesOrLinks(String html) {
    Document doc = Jsoup.parse(html);
    Elements eles = doc.select("img,a");
    List<String> result = new LinkedList<>();
    for (Element element : eles) {
        boolean isAnchor = "a".equals(element.nodeName());
        String link = element.attr(isAnchor ? "href" : "src");
        if (StringUtils.isBlank(link)) {
            continue;
        }
        if (!isAnchor) {
            result.add(link);
            continue;
        }

        int question = link.indexOf("?");
        if (question > 0) {
            link = link.substring(0, question);
        }
        int lastDot = link.lastIndexOf(".");
        // Locale.ROOT keeps the lower-casing independent of the default
        // locale; under a Turkish locale "GIF" would otherwise become "gıf".
        String ext = link.substring(lastDot + 1).toLowerCase(java.util.Locale.ROOT);
        if (FileUtils.isImageExt(ext)) {
            result.add(link);
        }
    }
    return result;
}
/**
 * Checks whether an element is an HTML input of the image type.
 *
 * <p>The element qualifies when its node name equals {@code INPUT}, it has a
 * {@code TYPE} attribute, and that attribute's value equals {@code IMAGE}
 * ignoring case. The value is compared case-insensitively because HTML treats
 * the keywords of the input {@code type} attribute as case-insensitive, so
 * markup such as {@code type="IMAGE"} is recognised as well.
 *
 * @param element the element to inspect
 * @return {@code true} if the element is an image input, {@code false} otherwise
 */
static boolean isImageInput(Element element) {
    if (!element.nodeName().equals(INPUT)) {
        return false;
    }
    if (!element.hasAttr(TYPE)) {
        return false;
    }
    return element.attr(TYPE).equalsIgnoreCase(IMAGE);
}