/**
  * Builds a simplified copy of {@code el}, recursively.
  *
  * <p>Tag mapping performed on the copy (attributes are never copied):
  *
  * <ul>
  *   <li>{@code img} becomes {@code x} whose only content is the {@code src} value;
  *   <li>{@code em} becomes {@code b};
  *   <li>{@code a} becomes {@code x}; its content depends on the {@code class} attribute:
  *       {@code "user"} yields {@code "@" + trimmed text}, a class starting with
  *       {@code "postimg video"} yields a {@code VIDEO: ... THUMBNAIL: ...} line, a class
  *       starting with {@code "postimg"} or equal to {@code "post"} keeps the (cleaned)
  *       children, and anything else yields the {@code href} value;
  *   <li>{@code div} becomes {@code x};
  *   <li>every other element keeps its own tag.
  * </ul>
  *
  * @param el the element to copy; it is not modified
  * @return a new, parentless element holding the cleaned content
  */
private Element cleanupElement(Element el) {
   Tag newTag = null;
   String newText = null;
   switch (el.nodeName()) {
     case "img":
       newTag = Tag.valueOf("x");
       newText = el.attr("src");
       break;
     case "em":
       newTag = Tag.valueOf("b");
       break;
     case "a":
       newTag = Tag.valueOf("x");
       String clazz = el.attr("class");
       if (clazz.equals("user")) {
         newText = "@" + el.text().trim();
       } else if (clazz.startsWith("postimg video")) {
         // Must be tested before the plain "postimg" prefix below.
         newText = "VIDEO: " + el.attr("href") + " THUMBNAIL: " + el.select("img").attr("src");
       } else if (!clazz.startsWith("postimg") && !clazz.equals("post")) {
         newText = el.attr("href");
       }
       // "postimg..." and "post" links keep their children (newText stays null).
       break;
     case "div":
       newTag = Tag.valueOf("x");
       break;
     default:
       break;
   }

   Element nel = new Element(newTag != null ? newTag : el.tag(), "");
   if (newText != null) {
     nel.appendChild(new TextNode(newText, ""));
     return nel;
   }

   for (Node child : el.childNodes()) {
     if (child instanceof Element) {
       nel.appendChild(cleanupElement((Element) child));
     } else if (child instanceof TextNode) {
       // Copy the raw text. toString() would return the escaped outer HTML, and wrapping
       // that in a new TextNode escapes it a second time ("&" -> "&amp;amp;").
       nel.appendChild(new TextNode(((TextNode) child).getWholeText(), ""));
     } else {
       // Comments, data nodes, etc.: keep the previous behaviour (their markup as text).
       nel.appendChild(new TextNode(child.toString(), ""));
     }
   }
   return nel;
 }
Пример #2
0
 /**
  * Parses {@code html} and returns the text of its last {@code <title>} element.
  *
  * <p>While walking the document it also notes whether any title contains both "moved" and
  * "permanently" (case-insensitively); if so, {@code getMovedUrl} is invoked for each
  * {@code <body>} seen afterwards and once more for the whole document.
  *
  * <p>NOTE(review): the values returned by {@code getMovedUrl} are never used, so they do
  * not influence the result - confirm whether the redirect target was meant to be returned.
  *
  * @param html the HTML source to inspect
  * @return the text of the last title element, or an empty string when there is none
  */
 private static String makeModular(String html) {
   Document parsed = Jsoup.parse(html);
   String title = "";
   boolean movedNotice = false;

   for (Element node : parsed.getAllElements()) {
     String tag = node.nodeName();
     if (tag.equals("title")) {
       title = node.text();
       String lowered = title.toLowerCase();
       if (lowered.contains("moved") && lowered.contains("permanently")) {
         movedNotice = true;
       }
     } else if (tag.equals("body") && movedNotice) {
       // Result intentionally left unused, exactly as before.
       getMovedUrl(node);
     }
   }

   if (movedNotice) {
     // Result intentionally left unused, exactly as before.
     getMovedUrl(parsed);
   }
   return title;
 }
Пример #3
0
  /**
   * Fetches every entry of {@code urls} (declared outside this method), prints the page
   * title and prints any link whose text is "here".
   *
   * <p>The whole loop sits inside a single {@code try}: the first HTTP or I/O failure is
   * reported and the remaining URLs are not processed.
   *
   * @return always the empty string
   */
  private static String kurwaFunction() {
    try {
      // Url URL = new Url();
      for (String url : urls) {
        url = cleanUrl(url); // always clean first
        // Shortened links are expanded before fetching. Note this is a substring match on
        // the whole URL, not a host comparison.
        if (url.contains("t.co") || url.contains("bit.ly")) {
          URLUnshortener shortener = new URLUnshortener();
          URL url2 = shortener.expand(new URL(url));
          url = url2.toString();
        }
        Document doc = Jsoup.connect(url).get();
        Elements els = doc.getAllElements();
        // NOTE(review): hasAttr tests for *attributes* named "title" / "h1" on the matched
        // elements, not for <title> / <h1> tags - confirm this is the intended filter.
        if (els.hasAttr("title") || els.hasAttr("h1")) {
          for (Element el : els) {
            // Re-initialised on every iteration, so the "&& href" test below is always true.
            boolean href = true;
            if (el.nodeName().equals("a") && href) {
              if (el.text().toLowerCase().equals("here")) {
                // NOTE(review): the two-argument attr(...) is jsoup's setter; this
                // presumably blanks the href and prints the whole element rather than
                // printing the link target - verify against the jsoup docs and the intent.
                Element el1 = el.attr("href", "");
                System.out.println(el1);
              }
            }
            switch (el.nodeName()) {
              case "title":
                System.out.println(el.text());
                if (!el.text().isEmpty()) {
                  break;
                }
                // An empty title falls through into the "h1" case (no break above).
              case "h1":
                if (el.text().contains("moved") && el.text().contains("has")) {
                  // Has no lasting effect: href is reset to true at the top of the loop.
                  href = true;
                }
                break;
              default:
                break;
            }
          }
        }
      }

    } catch (HttpStatusException ex) {
      // Non-success HTTP status: only the status code is printed.
      System.out.println("Exception http " + ex.getStatusCode());
    } catch (IOException ex) {
      // Any other I/O failure is logged at SEVERE level.
      Logger.getLogger(test.class.getName()).log(Level.SEVERE, null, ex);
    }
    return "";
  }
Пример #4
0
 /**
  * Returns the absolute {@code href} of the last {@code <a>} element found in {@code el}
  * (the element itself included), in document order.
  *
  * @param el the element whose subtree is searched
  * @return the value of {@code absUrl("href")} for the last anchor, or an empty string when
  *     the subtree contains no anchor
  */
 private static String getMovedUrl(Element el) {
   Elements nodes = el.getAllElements();
   // Walk backwards: the first anchor met this way is the last one in document order.
   for (int i = nodes.size() - 1; i >= 0; i--) {
     Element candidate = nodes.get(i);
     if (candidate.nodeName().equals("a")) {
       return candidate.absUrl("href");
     }
   }
   return "";
 }
Пример #5
0
  /**
   * Downloads the page at {@code url} and extracts one {@code CaoImg} per
   * {@code <table class="wikitable">} element.
   *
   * <p>For each such table:
   *
   * <ul>
   *   <li>name: the {@code title} attribute of the first descendant carrying one, or an
   *       empty string when there is none;
   *   <li>img: the {@code src} of the first {@code <img>}, or an empty string;
   *   <li>intro / otherName: taken from the elements whose {@code style} attribute contains
   *       "font-size". With exactly one such element it is the intro; with two or more the
   *       first is the other name and the second the intro; with none both stay
   *       {@code null}.
   * </ul>
   *
   * @param url address of the page to scrape
   * @return the extracted entries in document order; never {@code null}
   * @throws Exception whatever {@code HttpUtils.getString} propagates
   */
  public static List<CaoImg> getImgData(String url) throws Exception {
    String response = HttpUtils.getString(url);
    Document page = Jsoup.parse(response);

    List<CaoImg> caoImgs = new ArrayList<CaoImg>();

    for (Element element : page.getAllElements()) {
      // <table class="wikitable"
      // style="width: 22em; position: absolute; top: 0px; left: 0px;">
      boolean isWikiTable =
          element.nodeName().equals("table") && "wikitable".equals(element.attr("class"));
      if (!isWikiTable) {
        continue;
      }

      // Elements.attr yields "" when nothing matches, so a table without a titled
      // descendant or without an image no longer aborts the whole scrape.
      String title = element.getElementsByAttribute("title").attr("title");
      String src = element.getElementsByTag("img").attr("src");

      Elements styleElements =
          element.getElementsByAttributeValueContaining("style", "font-size");
      String otherName = null;
      String intro = null;
      if (styleElements.size() == 1) {
        intro = styleElements.get(0).text();
      } else if (styleElements.size() >= 2) {
        otherName = styleElements.get(0).text();
        intro = styleElements.get(1).text();
      }
      // With no styled element both values stay null instead of throwing.

      CaoImg caoImg = new CaoImg();
      caoImg.setName(title);
      caoImg.setImg(src);
      caoImg.setOtherName(otherName);
      caoImg.setIntro(intro);

      caoImgs.add(caoImg);
    }

    return caoImgs;
  }
Пример #6
0
  /**
   * Collects every address in the given HTML text that may point to an image.
   *
   * <p>All {@code <img>} elements contribute their non-blank {@code src} unchanged. An
   * {@code <a>} element contributes its {@code href} - with any query string removed - only
   * when the part after the last '.' is accepted by {@code FileUtils.isImageExt}.
   *
   * @param html the HTML source to scan
   * @return the matching addresses in document order; never {@code null}
   */
  public static List<String> getImagesOrLinks(String html) {
    Document doc = Jsoup.parse(html);
    List<String> result = new LinkedList<>();
    for (Element element : doc.select("img,a")) {
      boolean isAnchor = "a".equals(element.nodeName());
      String link = element.attr(isAnchor ? "href" : "src");
      if (StringUtils.isBlank(link)) {
        continue;
      }

      if (!isAnchor) {
        result.add(link);
        continue;
      }

      // Drop the query string before looking at the extension.
      int question = link.indexOf('?');
      if (question > 0) {
        link = link.substring(0, question);
      }

      int dot = link.lastIndexOf('.');
      if (dot < 0) {
        // No extension at all; previously the whole link was tested as if it were one.
        continue;
      }

      // Locale.ROOT keeps the mapping stable (e.g. "GIF" -> "gif") under any default locale.
      String ext = link.substring(dot + 1).toLowerCase(java.util.Locale.ROOT);
      if (FileUtils.isImageExt(ext)) {
        result.add(link);
      }
    }

    return result;
  }
 /**
  * Tells whether an element is an image-type HTML input.
  *
  * <p>The element qualifies when its node name equals {@code INPUT}, it carries a
  * {@code TYPE} attribute, and that attribute's value equals {@code IMAGE}.
  *
  * @param element the element to test
  * @return {@code true} when all three conditions hold, {@code false} otherwise
  */
 static boolean isImageInput(Element element) {
   if (!element.nodeName().equals(INPUT)) {
     return false;
   }
   if (!element.hasAttr(TYPE)) {
     return false;
   }
   String declaredType = element.attr(TYPE);
   return declaredType.equals(IMAGE);
 }