Java Parser Exemples, org.htmlparser.util.Parser Java Exemples

Exemple #1

0

Afficher le fichier

Fichier : RegistryArchivalUnit.java Projet : fkautz/cmusv-practicum

 // If there is a <title> element on the start page, use that as our AU
 // name.
 String recomputeRegName() {
   if (!isStarted()) {
     // This can get invoked (seveeral times, mostly from logging) before
     // enough mechanism has started to make it possible to resolve the CuUrl
     // below.
     return null;
   }
   try {
     CachedUrl cu = makeCachedUrl(m_registryUrl);
     if (cu == null) return null;
     URL cuUrl = CuUrl.fromCu(cu);
     Parser parser = new Parser(cuUrl.toString());
     NodeList nodelst = parser.extractAllNodesThatMatch(new NodeClassFilter(TitleTag.class));
     Node nodes[] = nodelst.toNodeArray();
     recomputeRegName = false;
     if (nodes.length < 1) return null;
     // Get the first title found
     TitleTag tag = (TitleTag) nodes[0];
     if (tag == null) return null;
     return tag.getTitle();
   } catch (MalformedURLException e) {
     log.warning("recomputeRegName", e);
     return null;
   } catch (ParserException e) {
     if (e.getThrowable() instanceof FileNotFoundException) {
       log.warning("recomputeRegName: " + e.getThrowable().toString());
     } else {
       log.warning("recomputeRegName", e);
     }
     return null;
   }
 }

Exemple #2

0

Afficher le fichier

Fichier : HtmlParserTool.java Projet : yuexuahandao/spiker

  // 获取一个网站上的链接,filter 用来过滤链接
  public static Set<String> extracLinks(String url, NodeFilter filter) {
    Set<String> links = new HashSet<String>();
    try {
      Parser parser = new Parser(url);
      parser.setEncoding("UTF-8");

      @SuppressWarnings("serial")
      NodeFilter frameFilter =
          new NodeFilter() {
            public boolean accept(Node node) {
              if (node.getText().startsWith("frame src=")) {
                return true;
              } else {
                return false;
              }
            }
          };

      OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);

      NodeList list = parser.extractAllNodesThatMatch(linkFilter);

      System.out.println("length=" + list.size());

      for (int i = 0; i < list.size(); i++) {
        Node tag = list.elementAt(i);

        if (tag instanceof LinkTag) { // <a> 标签
          LinkTag link = (LinkTag) tag;
          String linkUrl = link.getLink(); // URL

          /*
           * if (filter.accept(linkUrl)) { links.add(linkUrl); }
           */

          System.out.println("linkUrl=" + linkUrl);

          if (filter.accept(tag)) {
            links.add(linkUrl);
          }
        } else { // <frame> 标签
          // 提取 frame 里 src 属性的链接,如 <frame src="test.html"/>
          String frame = tag.getText();
          int start = frame.indexOf("src=");
          frame = frame.substring(start);
          int end = frame.indexOf(" ");

          if (end == -1) {
            end = frame.indexOf(">");
          }

          String frameUrl = frame.substring(5, end - 1);
          // if (filter.accept(frameUrl)) {
          // links.add(frameUrl);
          // }

          System.out.println("frameUrl=" + frameUrl);

          if (filter.accept(tag)) {
            links.add(frameUrl);
          }
        }
      }

      /*
       * NodeFilter filter = new TagNameFilter("DIV"); NodeList nodes =
       * parser.extractAllNodesThatMatch(filter); if(nodes!=null) { for
       * (int i = 0; i < nodes.size(); i++) { Node textnode = (Node)
       * nodes.elementAt(i);
       * System.out.println("getText:"+textnode.getText());
       * System.out.println
       * ("================================================="); } }
       */
      /*
       * for(NodeIterator i = parser.elements (); i.hasMoreNodes(); ) {
       * Node node = i.nextNode();
       * System.out.println("getText:"+node.getText());
       * System.out.println("getPlainText:"+node.toPlainTextString());
       * System.out.println("toHtml:"+node.toHtml());
       * System.out.println("toHtml(true):"+node.toHtml(true));
       * System.out.println("toHtml(false):"+node.toHtml(false));
       * System.out.println("toString:"+node.toString());
       * System.out.println
       * ("================================================="); }
       */

      /*
       * TextExtractingVisitor visitor = new TextExtractingVisitor();
       * parser.visitAllNodesWith(visitor); String textInPage =
       * visitor.getExtractedText(); System.out.println(textInPage);
       */

    } catch (ParserException e) {
      e.printStackTrace();
    }
    return links;
  }

Exemple #3

0

Afficher le fichier

Fichier : Main.java Projet : damacode/MyTumblr

 private static boolean handleURL(String address) {
   Main.status(String.format("Processing page \"%s\".", address));
   try {
     NodeList posts = getPosts(address);
     if (posts.toNodeArray().length == 0) {
       return false;
     }
     for (Node post_node : posts.toNodeArray()) {
       if (post_node instanceof TagNode) {
         TagNode post = (TagNode) post_node;
         Post new_post = new Post(Long.parseLong(post.getAttribute("id").substring(5)));
         if (!Main.post_post_hash.containsKey(new_post)) {
           NodeList photo_posts = getPhotoPosts(post.getChildren());
           NodeList remarks = getRemarks(photo_posts);
           for (Node node : remarks.toNodeArray()) {
             Matcher matcher = lores.matcher(node.getText());
             String media_url = "";
             if (matcher.find()) {
               media_url = matcher.group();
               media_url = media_url.substring(17, media_url.length() - 1);
             }
             String thumb =
                 media_url.replace(
                     media_url.substring(media_url.lastIndexOf("_"), media_url.lastIndexOf(".")),
                     "_75sq");
             URL thumb_url = new URL(thumb);
             new_post.pictures.add(new Picture(new URL(media_url), thumb_url));
           }
           NodeList photoset_posts = getPhotosetPosts(post.getChildren());
           NodeList iframes = getIFrames(photoset_posts);
           for (Node node : iframes.toNodeArray()) {
             if (node instanceof TagNode) {
               String iframe_url = ((TagNode) node).getAttribute("src");
               Parser parser2 = new Parser(iframe_url);
               NodeList a_list = parser2.extractAllNodesThatMatch(new TagNameFilter("a"));
               Node[] a_array = a_list.toNodeArray();
               Node[] img_array =
                   a_list.extractAllNodesThatMatch(new TagNameFilter("img"), true).toNodeArray();
               String media_url;
               for (int i = 0; i < a_array.length; i++) {
                 media_url = ((TagNode) img_array[i]).getAttribute("src");
                 String thumb =
                     media_url.replace(
                         media_url.substring(
                             media_url.lastIndexOf("_"), media_url.lastIndexOf(".")),
                         "_75sq");
                 URL thumb_url = new URL(thumb);
                 new_post.pictures.add(new Picture(new URL(media_url), thumb_url));
               }
             }
           }
           Main.handlePost(new_post);
         } else {
           new_post = post_post_hash.get(new_post);
           handleNonDownloadPost(new_post);
         }
       }
     }
   } catch (Exception ex) {
     ex.printStackTrace();
     Main.status("Error handling post.");
   }
   return true;
 }