コード例 #1
0
ファイル: Sitemap.java プロジェクト: tylerhill/java
  /**
   * Fetches the page at {@code mapUrl} and collects the {@code abs:href} values of its anchors
   * that pass the site filters.
   *
   * <p>A link is kept only when it contains {@code base} and does not contain the text
   * {@code "index"}. Kept links appear in the order the anchors occur in the document.
   *
   * @param mapUrl URL of the page to fetch and scan for anchors
   * @param base substring every kept link must contain (used to drop links to external domains)
   * @return a mutable list of {@code String} links, or {@code null} if anything threw while
   *     fetching or filtering (callers must null-check)
   */
  public static List genSitemap(String mapUrl, String base) {
    try {
      Document doc = Jsoup.connect(mapUrl).get();
      List<String> siteLinks = new ArrayList<String>();
      for (Element anchor : doc.select("a")) {
        String href = anchor.attr("abs:href");
        // Filter out links to external domains.
        if (!href.contains(base)) {
          continue;
        }
        // Skip index pages; per the original author's note this prevents an infinite loop
        // (presumably in the caller's crawl -- confirm against the caller).
        if (href.contains("index")) {
          continue;
        }
        siteLinks.add(href);
      }
      return siteLinks;
    } catch (Exception e) {
      // Deliberately best-effort and silent: any failure is reported to the caller as null.
      return null;
    }
  }
コード例 #2
0
ファイル: Worker.java プロジェクト: m1/Parker
  /**
   * Fetches {@code url}, tallies the hosts of its outgoing links, and queues the parsed links.
   *
   * <p>For every anchor with a non-empty {@code abs:href} value, a {@code Parser} is built from
   * that value; the count stored in {@code tempStats} under the parser's host is incremented (or
   * set to 1 on first sight), and the parser's URL is added to {@code tempQueue}.
   *
   * @param url page to fetch and scan for anchors
   * @param verbose when {@code true}, prints the current thread name, the size of
   *     {@code tempQueue}, and {@code url} to standard output
   * @throws Exception if fetching the page or processing a link throws
   */
  public Worker(String url, boolean verbose) throws Exception {
    Document page = Jsoup.connect(url).get();
    // Only anchors that carry an href attribute are considered.
    for (Element anchor : page.select("a[href]")) {
      // Request the absolute form of the link.
      String target = anchor.attr("abs:href");
      if (target.isEmpty()) {
        continue;
      }
      Parser parsed = new Parser(target);
      String hostName = parsed.getHost();
      if (!tempStats.containsKey(hostName)) {
        // First link seen for this host.
        tempStats.put(hostName, 1);
      } else {
        // Host already recorded: bump its counter.
        int seen = tempStats.get(hostName);
        tempStats.put(hostName, seen + 1);
      }
      // Queue the parsed form of the link.
      tempQueue.add(parsed.getURL());
    }
    if (!verbose) {
      return;
    }
    String threadName = Thread.currentThread().getName();
    System.out.println(threadName + " : " + tempQueue.size() + " links from " + url);
  }
コード例 #3
0
ファイル: Sitemap.java プロジェクト: tylerhill/java
 /**
  * Fetches the page at {@code mapUrl} and lists the {@code abs:src} values of its images.
  *
  * <p>The first element of the result is always {@code mapUrl} itself; the image sources follow
  * in document order. Sources containing the text {@code "paypal"} are left out.
  *
  * @param mapUrl URL of the page to fetch and scan for {@code img} elements
  * @return a mutable list of {@code String} values, or {@code null} if anything threw (the
  *     exception is printed to standard output)
  */
 public static List getImgs(String mapUrl) {
   try {
     Document page = Jsoup.connect(mapUrl).get();
     List<String> sources = new ArrayList<String>();
     // The page URL itself leads the list.
     sources.add(mapUrl);
     for (Element image : page.select("img")) {
       String src = image.attr("abs:src");
       if (src.contains("paypal")) {
         continue;
       }
       sources.add(src);
     }
     return sources;
   } catch (Exception e) {
     System.out.println(e);
     return null;
   }
 }