/**
 * Downloads the style sheets linked from the document's HEAD and stores their text in a
 * {@code CSSData} instance registered on the document under the key {@code "CSS.DATA"}.
 *
 * <p>Only {@code LINK} nodes that carry a {@code type} attribute equal (ignoring case) to
 * {@code text/css} and a non-null {@code href} attribute are considered. Each href is resolved
 * against {@code address} before being fetched with {@code loadContent}.
 *
 * @param address URL of the page the document was loaded from; used as the base for hrefs
 * @param document parsed document that is scanned and that receives the {@code CSSData} resource
 * @throws Exception if the address cannot be parsed or a style sheet cannot be loaded
 */
private static void loadCSS(String address, HTMLDocument document) throws Exception {
    CSSData cssData = new CSSData();
    document.putResource("CSS.DATA", cssData);

    NodePath nodePath = pathParser.toPath("HEAD");
    HTMLNode head = extractor.lookNode(document.getRoot(), nodePath);
    // Guard in case the lookup yields no HEAD node; the empty CSSData stays registered.
    if (head == null) return;

    URLUtils urlUtils = new URLUtils();
    // Parsed lazily, once: the base only matters when at least one style sheet link exists.
    URL baseURL = null;
    NodeIterator iterator = head.iterator();
    while (iterator.hasNext()) {
      HTMLNode node = iterator.next();
      if (!node.isNode(Name.LINK)) continue;
      Attributes attributes = node.getAttributes();
      Attribute attribute = attributes.get("type");
      if (attribute == null) continue;
      if (!"text/css".equalsIgnoreCase(attribute.getValue())) continue;

      attribute = attributes.get("href");
      if (attribute == null) continue;
      String link = attribute.getValue();
      if (link == null) continue;

      if (baseURL == null) baseURL = new URL(address);
      link = urlUtils.createURL(baseURL, link);

      System.out.println(link);
      byte[] bytes = loadContent(link);

      String css = new String(bytes, "utf-8");
      cssData.addValue(css);
    }
  }
Exemple #2
0
 /**
  * Reports whether the nodes visited by {@code node.iterator()} include a {@code FORM} node
  * or any node accepted by {@link #isFormElement(HTMLNode)}.
  *
  * @param node node whose iterator is scanned
  * @return {@code true} as soon as a form or form element is met, {@code false} otherwise
  */
 boolean hasForm(HTMLNode node) {
   for (NodeIterator walker = node.iterator(); walker.hasNext(); ) {
     HTMLNode candidate = walker.next();
     if (candidate.isNode(Name.FORM) || isFormElement(candidate)) {
       return true;
     }
   }
   return false;
 }
 /**
  * Strips the {@code src} attribute from the given node when it is an {@code IFRAME}, then
  * applies the same treatment recursively to every child.
  *
  * @param node root of the subtree to process
  */
 protected void removeIFrameSource(HTMLNode node) {
   if (node.isNode(Name.IFRAME)) {
     node.getAttributes().remove("src");
   }

   List<HTMLNode> kids = node.getChildren();
   if (kids == null || kids.isEmpty()) {
     return;
   }
   for (HTMLNode kid : kids) {
     removeIFrameSource(kid);
   }
 }
Exemple #4
0
 /**
  * Sums the word counts of the {@code CONTENT} nodes visited by {@code node.iterator()},
  * skipping every visited node for which {@code getAncestor(n, Name.A, 0, 5)} finds an
  * {@code A} node (i.e. text sitting in or closely below a link).
  *
  * @param node node to scan; {@code null} yields 0
  * @return total as reported by {@code textCounter.countWord} over the counted text values
  */
 private int countWord(HTMLNode node) {
   int total = 0;
   if (node != null) {
     for (NodeIterator walker = node.iterator(); walker.hasNext(); ) {
       HTMLNode current = walker.next();
       boolean insideLink = getAncestor(current, Name.A, 0, 5) != null;
       if (insideLink || !current.isNode(Name.CONTENT)) {
         continue;
       }
       String text = current.getTextValue();
       total += textCounter.countWord(text, 0, text.length());
     }
   }
   return total;
 }
Exemple #5
0
  /**
   * Decides whether the node, or one of its descendants, is a list made up mostly of links.
   *
   * <p>When {@code isListNode} accepts the node's children, the node qualifies if the number of
   * {@code A} nodes visited by its iterator is at least the child count minus 3. Otherwise the
   * check is repeated on each child and the first positive answer wins.
   *
   * @param node node to inspect
   * @return {@code true} if a qualifying list was found, {@code false} otherwise (including
   *     when the node has no child list)
   */
  private boolean isLinkContainer(HTMLNode node) {
    List<HTMLNode> kids = node.getChildren();
    if (kids == null) {
      return false;
    }

    if (!isListNode(kids)) {
      // Not a list itself: look for a qualifying list further down.
      for (HTMLNode kid : kids) {
        if (isLinkContainer(kid)) {
          return true;
        }
      }
      return false;
    }

    int anchors = 0;
    for (NodeIterator walker = node.iterator(); walker.hasNext(); ) {
      if (walker.next().isNode(Name.A)) {
        anchors++;
      }
    }
    return anchors >= kids.size() - 3;
  }
Exemple #6
0
  /**
   * Prunes link blocks and out-of-range text from the tree under {@code root}.
   *
   * <p>Pass 1 walks {@code root.iterator()} and queues for removal:
   * <ul>
   *   <li>{@code UL} nodes accepted by {@code isLinkContainer};
   *   <li>{@code DIV} and {@code TD} nodes accepted by {@code isLinkDiv};
   *   <li>{@code CONTENT} nodes visited before {@code first} or after {@code last}
   *       ({@code first} and {@code last} themselves are kept; they are matched by identity).
   * </ul>
   *
   * <p>Pass 2 detaches every queued node and then keeps climbing: while the parent that just
   * lost a child has a {@code countWord} of 15 or less, that parent is emptied and detached
   * from its own parent as well.
   *
   * @param root subtree to clean
   * @param first first content node to keep
   * @param last last content node to keep
   */
  public void remove(HTMLNode root, HTMLNode first, HTMLNode last) {
    // True while the walk is outside the [first, last] range of content nodes.
    boolean remove = true;

    List<HTMLNode> removes = new ArrayList<HTMLNode>();
    NodeIterator nodeIterator = root.iterator();
    while (nodeIterator.hasNext()) {
      HTMLNode node = nodeIterator.next();
      switch (node.getName()) {
        case UL:
          if (isLinkContainer(node)) removes.add(node);
          break;
        case DIV:
        case TD:
          if (isLinkDiv(node)) removes.add(node);
          break;
        case CONTENT:
          // Order matters: the flag is cleared before the check so `first` is kept, and
          // set again only after the check so `last` is kept too.
          if (node == first) remove = false;
          if (remove) {
            removes.add(node);
          }
          if (node == last) remove = true;
          break;
        default:
          break;
      }
    }

    for (int i = 0; i < removes.size(); i++) {
      HTMLNode node = removes.get(i);
      HTMLNode parent = node.getParent();
      // Climb towards the root, removing each ancestor that is left with too little text.
      while (parent != null) {
        node.setValue(new char[] {});
        parent.removeChild(node);

        // Stop once the parent still holds more than 15 counted words.
        int word = countWord(parent);
        if (word > 15) break;
        node = parent;
        parent = node.getParent();
      }
    }
  }
Exemple #7
0
 /**
  * Tells whether the node's name is one of the form controls
  * {@code INPUT}, {@code TEXTAREA}, {@code SELECT}, {@code LABEL} or {@code BUTTON}.
  *
  * @param node node whose name is tested
  * @return {@code true} for the names listed above, {@code false} for any other name
  */
 private boolean isFormElement(HTMLNode node) {
   boolean formControl;
   switch (node.getName()) {
     case BUTTON:
     case LABEL:
     case SELECT:
     case TEXTAREA:
     case INPUT:
       formControl = true;
       break;
     default:
       formControl = false;
       break;
   }
   return formControl;
 }
Exemple #8
0
  /**
   * Compares the structure of two subtrees.
   *
   * <p>Two nodes match when they have the same name, the same number of attributes with
   * pairwise equal attribute names (ignoring case, compared position by position; a null
   * attribute only matches another null), and either both have no child list or both have
   * child lists of equal size whose elements match recursively in order.
   *
   * @param node1 first subtree
   * @param node2 second subtree
   * @return {@code true} if both subtrees have the same format
   */
  private boolean equalsFormat(HTMLNode node1, HTMLNode node2) {
    if (node1.getName() != node2.getName()) {
      return false;
    }

    Attributes attrs1 = node1.getAttributes();
    Attributes attrs2 = node2.getAttributes();
    int attrCount = attrs1.size();
    if (attrCount != attrs2.size()) {
      return false;
    }
    for (int i = 0; i < attrCount; i++) {
      Attribute left = attrs1.get(i);
      Attribute right = attrs2.get(i);
      if (left == null || right == null) {
        // A missing attribute is only acceptable when it is missing on both sides.
        if (left != right) {
          return false;
        }
        continue;
      }
      if (!left.getName().equalsIgnoreCase(right.getName())) {
        return false;
      }
    }

    List<HTMLNode> kids1 = node1.getChildren();
    List<HTMLNode> kids2 = node2.getChildren();
    if (kids1 == null || kids2 == null) {
      return kids1 == kids2;
    }
    int kidCount = kids1.size();
    if (kidCount != kids2.size()) {
      return false;
    }
    for (int i = 0; i < kidCount; i++) {
      if (!equalsFormat(kids1.get(i), kids2.get(i))) {
        return false;
      }
    }
    return true;
  }
  /**
   * Manual driver: downloads a fixed article page, normalizes its text nodes, loads its style
   * sheets, runs {@code WebPageDataSearcher} on the BODY and saves the text of the node it
   * returns to a fixed output file (an empty file when nothing is returned).
   *
   * @param args ignored
   * @throws Exception if loading, parsing or saving fails
   */
  public static void main(String[] args) throws Exception {
    String address = "http://vnexpress.net/GL/Xa-hoi/2009/02/3BA0B4AB/";
    webClient.setURL(address, new URL(address));
    HTMLDocument document = HTMLParser.createDocument(loadContent(address), "utf-8");

    // Decode references in every text node, trim it and bring it to Unicode NFC form.
    RefsDecoder decoder = new RefsDecoder();
    NodeIterator iterator = document.getRoot().iterator();
    while (iterator.hasNext()) {
      HTMLNode node = iterator.next();
      if (!node.isNode(Name.CONTENT)) continue;
      char[] chars = node.getValue();
      chars = decoder.decode(chars);

      chars = CharsUtil.cutAndTrim(chars, 0, chars.length);
      chars = java.text.Normalizer.normalize(new String(chars), Normalizer.Form.NFC).toCharArray();
      node.setValue(chars);
    }

    loadCSS(address, document);

    NodePath nodePath = pathParser.toPath("BODY");
    HTMLNode body = extractor.lookNode(document.getRoot(), nodePath);

    WebPageDataSearcher dataSearcher = new WebPageDataSearcher(document);
    HTMLNode node = dataSearcher.search(body);

    File file = new File("F:\\Temp2\\web\\output\\extract.htm");
    byte[] bytes = new byte[0];
    if (node != null) bytes = node.getTextValue().getBytes(Application.CHARSET);
    RWData.getInstance().save(file, bytes);
  }
Exemple #10
0
  /**
   * Decides whether a container consists essentially of links.
   *
   * <p>First collects the {@code A} nodes under {@code node} that {@code linkNodeChecker}
   * rejects. Then iterates again via {@code node.iterator(rejected)} (presumably skipping the
   * collected nodes; verify against {@code HTMLNode}) and adds up {@code countWord} for each
   * {@code CONTENT} node met.
   *
   * @param node container to inspect
   * @return {@code true} when more than one link was rejected and fewer than 5 words were
   *     counted; {@code false} otherwise, including when the node has no child list
   */
  private boolean isLinkDiv(HTMLNode node) {
    if (node.getChildren() == null) {
      return false;
    }

    List<HTMLNode> rejectedLinks = new ArrayList<HTMLNode>();
    for (NodeIterator walker = node.iterator(); walker.hasNext(); ) {
      HTMLNode candidate = walker.next();
      if (!candidate.isNode(Name.A)) {
        continue;
      }
      if (linkNodeChecker.isValid(new CheckModel(candidate), 0)) {
        continue;
      }
      rejectedLinks.add(candidate);
    }

    int words = 0;
    for (NodeIterator walker = node.iterator(rejectedLinks); walker.hasNext(); ) {
      HTMLNode candidate = walker.next();
      if (candidate.isNode(Name.CONTENT)) {
        words += countWord(candidate);
      }
    }
    return rejectedLinks.size() > 1 && words < 5;
  }
Exemple #11
0
 /**
  * Searches upward from {@code node} (the node itself included) for a node with the given
  * name, following {@code getParent()} links.
  *
  * @param node starting node; may be {@code null}
  * @param name name to look for
  * @param level depth counter of the starting node
  * @param max highest depth counter still examined; the search gives up once it is exceeded
  * @return the first matching node on the way up, or {@code null} if none is found in range
  */
 private HTMLNode getAncestor(HTMLNode node, Name name, int level, int max) {
   HTMLNode current = node;
   for (int depth = level; depth <= max && current != null; depth++) {
     if (current.isNode(name)) {
       return current;
     }
     current = current.getParent();
   }
   return null;
 }