Exemple #1
0
  /**
   * Downloads the page at {@code url}, counts its outgoing links per host in {@code tempStats}
   * and appends every parsed link URL to {@code tempQueue}.
   *
   * @param url address of the page to download and scan
   * @param verbose when {@code true}, prints the current thread name, the size of
   *     {@code tempQueue} and {@code url} once the scan is finished
   * @throws Exception propagated from fetching the page or from parsing one of its links
   */
  public Worker(String url, boolean verbose) throws Exception {
    Document page = Jsoup.connect(url).get();

    // Only anchors that actually carry an href attribute are considered.
    for (Element anchor : page.select("a[href]")) {
      // The "abs:" prefix asks jsoup for the absolute form of the link.
      String target = anchor.attr("abs:href");
      if (target.isEmpty()) {
        continue;
      }

      Parser parsed = new Parser(target);
      String host = parsed.getHost();

      // Bump the per-host counter, starting at 1 the first time a host is seen.
      if (tempStats.containsKey(host)) {
        int seen = tempStats.get(host);
        tempStats.put(host, seen + 1);
      } else {
        tempStats.put(host, 1);
      }

      // Queue the parsed form of the link.
      tempQueue.add(parsed.getURL());
    }

    if (verbose) {
      String threadName = Thread.currentThread().getName();
      System.out.println(threadName + " : " + tempQueue.size() + " links from " + url);
    }
  }
Exemple #2
0
  /**
   * Downloads the page at {@code mapUrl} and collects the absolute targets of its anchors,
   * keeping only those that contain {@code base} and do not contain {@code "index"}.
   *
   * <p>The filter is a plain substring test on the absolute URL, so {@code base} is matched
   * anywhere in the link, not only in the host part.
   *
   * @param mapUrl address of the page whose links are collected
   * @param base substring a link must contain to be kept (used to drop external domains)
   * @return a mutable list of the matching link URLs as strings, in document order, or
   *     {@code null} if fetching or processing the page threw any exception
   */
  public static List genSitemap(String mapUrl, String base) {
    try {
      Document doc = Jsoup.connect(mapUrl).get();
      List<String> siteLinks = new ArrayList<String>();
      for (Element link : doc.select("a")) {
        String href = link.attr("abs:href");
        // Keep same-site links only. Links containing "index" are dropped as well; the
        // original author notes that this prevents an infinite loop when crawling.
        if (href.contains(base) && !href.contains("index")) {
          siteLinks.add(href);
        }
      }
      return siteLinks;
    } catch (Exception ignored) {
      // Deliberate best effort: any failure is reported to the caller as a null result.
      return null;
    }
  }
Exemple #3
0
 /**
  * Appends the ancestors of {@code el} to {@code parents}, nearest ancestor first, stopping
  * before an ancestor whose tag name is {@code "#root"} or when there is no further parent.
  *
  * @param el element whose ancestor chain is walked
  * @param parents collection that receives the ancestors
  */
 private static void accumulateParents(Element el, Elements parents) {
   Element ancestor = el.parent();
   while (ancestor != null && !ancestor.tagName().equals("#root")) {
     parents.add(ancestor);
     ancestor = ancestor.parent();
   }
 }
Exemple #4
0
 /**
  * Tells whether whitespace should be preserved for {@code node}.
  *
  * <p>Only the node's own tag and the tag of its direct parent are consulted; deeper ancestors
  * are deliberately not searched, which avoids recursion and needless walks up the tree.
  *
  * @param node node to inspect; may be {@code null}
  * @return {@code true} if {@code node} is an {@code Element} and its tag or its parent's tag
  *     preserves whitespace; {@code false} otherwise
  */
 static boolean preserveWhitespace(Node node) {
   // instanceof is false for null, so this also covers a null node.
   if (!(node instanceof Element)) {
     return false;
   }
   Element element = (Element) node;
   if (element.tag.preserveWhitespace()) {
     return true;
   }
   Element parent = element.parent();
   return parent != null && parent.tag.preserveWhitespace();
 }
Exemple #5
0
 /**
  * Checks whether this element, or any element nested inside it, holds text that is more than
  * whitespace.
  *
  * @return {@code true} as soon as a non-blank text node is found among the children or their
  *     descendants; {@code false} if there is none
  */
 public boolean hasText() {
   for (Node child : childNodes) {
     if (child instanceof TextNode) {
       if (!((TextNode) child).isBlank()) {
         return true;
       }
     } else if (child instanceof Element && ((Element) child).hasText()) {
       // A nested element counts if it has non-blank text somewhere below it.
       return true;
     }
   }
   return false;
 }
Exemple #6
0
  /**
   * Returns the concatenated data of this element and of the elements nested inside it. Data
   * is, for example, the content of a {@code script} tag.
   *
   * @return the combined data in child order, or an empty string if there is none
   * @see #dataNodes()
   */
  public String data() {
    StringBuilder combined = new StringBuilder();
    for (Node child : childNodes) {
      if (child instanceof DataNode) {
        combined.append(((DataNode) child).getWholeData());
      } else if (child instanceof Element) {
        // Recurse so that data held by nested elements is included as well.
        combined.append(((Element) child).data());
      }
    }
    return combined.toString();
  }
Exemple #7
0
 /**
  * Downloads the page at {@code mapUrl} and lists the absolute sources of its images.
  *
  * <p>The first entry of the result is {@code mapUrl} itself; image sources containing
  * {@code "paypal"} are left out.
  *
  * @param mapUrl address of the page to scan
  * @return a list starting with {@code mapUrl} followed by the kept image sources, or
  *     {@code null} if any exception occurred (the exception is printed to standard output)
  */
 public static List getImgs(String mapUrl) {
   try {
     Elements images = Jsoup.connect(mapUrl).get().select("img");
     List<String> found = new ArrayList<String>();
     // The page's own address always leads the list.
     found.add(mapUrl);
     for (Element image : images) {
       String source = image.attr("abs:src");
       if (!source.contains("paypal")) {
         found.add(source);
       }
     }
     return found;
   } catch (Exception e) {
     System.out.println(e);
     return null;
   }
 }
        /**
         * Visits {@code el} and then each of its child elements, recording headings on the way.
         *
         * <p>When the lower-cased tag name of {@code el} matches {@code h[1-6]},
         * {@code secIndex} is incremented and the filtered heading text is stored in
         * {@code dataStore} under the key {@code SectionTitle/<title>/<hash>/<secIndex>}.
         * Every child is walked by a fresh {@code Walker} seeded with the current
         * {@code secIndex}, and the value it returns becomes the new {@code secIndex}.
         *
         * @param el element at which the walk starts
         * @return the value of {@code secIndex} after {@code el} and its children were visited
         */
        public int walk(Element el) {
          Elements children = el.children();
          String tagName = el.tagName().toLowerCase();

          if (tagName.matches("h[1-6]")) {
            secIndex++;
            String secName = el.text();
            // Plain concatenation renders the index exactly as the deprecated
            // new Integer(secIndex).toString() did, without the boxing constructor.
            String key = "SectionTitle" + "/" + title + "/" + hash + "/" + secIndex;
            String value = secNameFilter(secName);
            dataStore.put(key, value);
          }

          for (Element child : children) {
            secIndex = (new Walker(secIndex)).walk(child);
          }

          return secIndex;
        }
Exemple #9
0
  /**
   * Downloads a fixed sample page, looks up the element with id {@code datatable} and prints
   * the text of the second {@code TD} of every {@code TR} inside it to standard output.
   *
   * <p>If the page cannot be downloaded the stack trace is printed and nothing else happens;
   * if the page has no element with id {@code datatable} a message is written to standard
   * error. In neither case is an exception thrown.
   */
  public Scraper() {
    Document doc;
    try {
      doc =
          Jsoup.connect(
                  "http://www.geog.leeds.ac.uk/courses/other/programming/practicals/general/web/scraping-intro/table.html")
              .get();
    } catch (IOException ioe) {
      ioe.printStackTrace();
      // Without a document there is nothing to scrape; stop here rather than fail later
      // with a NullPointerException.
      return;
    }

    Element table = doc.getElementById("datatable");
    if (table == null) {
      System.err.println("No element with id \"datatable\" found in the downloaded page");
      return;
    }

    for (Element row : table.getElementsByTag("TR")) {
      Elements tds = row.getElementsByTag("TD");
      // Only the second cell of a row is printed; rows with fewer cells are skipped.
      if (tds.size() > 1) {
        System.out.println(tds.get(1).text());
      }
    }
  }
  /**
   * Downloads a fixed staff-directory page, looks up the element with id {@code mytable} and
   * prints the text of the second {@code TD} of every {@code TR} inside it to standard output.
   *
   * <p>If the page cannot be downloaded the stack trace is printed and the method returns; if
   * the page has no element with id {@code mytable} a message is written to standard error.
   *
   * @param args command-line arguments; not used
   */
  public static void main(String[] args) {
    Document doc;
    try {
      // get page
      doc =
          Jsoup.connect("http://fskm.uitm.edu.my/v1/fakulti/staff-directory/academic/1097.html")
              .get();
    } catch (IOException ex) {
      ex.printStackTrace();
      // Without a document there is nothing to print; stop here rather than fail later with
      // a NullPointerException.
      return;
    }

    // Get Element with specific ID
    Element table = doc.getElementById("mytable");
    if (table == null) {
      System.err.println("No element with id \"mytable\" found in the downloaded page");
      return;
    }

    // Print the text of the second cell of each row; rows with fewer cells are skipped.
    for (Element row : table.getElementsByTag("TR")) {
      Elements tds = row.getElementsByTag("TD");
      if (tds.size() > 1) {
        System.out.println(tds.get(1).text());
      }
    }
  }
 /**
  * Builds a deck-list URL from the filters in {@code deckBrowserRequest}, downloads it and
  * turns the rows of {@code table#decks} into {@code HSDeck} objects.
  *
  * <p>Each filter is appended as {@code "&" + <request parameter> + <value>} and only when it
  * is set: the sorting key and the deck-name filter when non-null and not blank, the class
  * filter when it is non-empty and does not contain {@code HSPlayerClass.ALL} (the filter
  * values of its single-class entries are summed), and the order-by field when non-null and
  * not empty (prefixed with {@code "-"} for descending order).
  *
  * @param deckBrowserRequest filter and ordering options for the request
  * @return the decks parsed from the page; if the download fails with an
  *     {@code IOException} the stack trace is printed and the list built so far (empty) is
  *     returned
  */
 @Override
 public List<HSDeck> getDeckListFiltered(final DeckBrowserRequest deckBrowserRequest) {
   final List<HSPlayerClass> classFilter = deckBrowserRequest.getClassFilter();
   final List<HSDeck> decks = new ArrayList<HSDeck>();
   try {
     String url = HPDeckSource.BASE_URL + HPDeckSource.DECKS_URL;

     final String sortingKey = deckBrowserRequest.getSortingKey();
     if (sortingKey != null && !sortingKey.trim().isEmpty()) {
       url += "&" + HP_REQUEST_PARAMS.FILTER_OPTION.requestParam + sortingKey;
     }

     final String deckNameFilter = deckBrowserRequest.getDeckNameFilter();
     if (deckNameFilter != null && !deckNameFilter.trim().isEmpty()) {
       url +=
           "&"
               + HP_REQUEST_PARAMS.FILTER_SEARCH.requestParam
               + this.constructDeckNameFilter(deckNameFilter);
     }

     if (classFilter != null
         && classFilter.size() > 0
         && !classFilter.contains(HSPlayerClass.ALL)) {
       // The selected classes are sent as the sum of their individual filter values.
       int classMask = 0;
       for (final HSPlayerClass playerClass : classFilter) {
         if (playerClass.isSingleClass()) {
           classMask += playerClass.getHsFilterValue();
         }
       }
       url += "&" + HP_REQUEST_PARAMS.FILTER_CLASS.requestParam + classMask;
     }

     final String orderBy = deckBrowserRequest.getOrderBy();
     if (orderBy != null && !orderBy.isEmpty()) {
       // A leading "-" requests descending order.
       final String direction = deckBrowserRequest.isAsc() ? "" : "-";
       url += "&" + HP_REQUEST_PARAMS.FILTER_SORT.requestParam + direction + orderBy;
     }

     final Elements rows =
         Jsoup.connect(url)
             .referrer(HPDeckSource.BASE_URL + "/")
             .followRedirects(true)
             .ignoreHttpErrors(true)
             .get()
             .select("table#decks tr");

     // The first and the last row are not parsed (presumably header and footer rows;
     // confirm against the page markup).
     final int lastRow = rows.size() - 1;
     for (int index = 1; index < lastRow; ++index) {
       final Element row = rows.get(index);
       final Elements nameLinks = row.select("td.col-name span.tip a");
       final Elements typeCells = row.select("td.col-deck-type");
       final Elements classCells = row.select("td.col-class");
       final Elements ratingCells = row.select("td.col-ratings div.rating-sum");
       final Elements costCells = row.select("td.col-dust-cost");
       final Elements updatedCells = row.select("td.col-updated abbr");

       final HSDeck deck = new HSDeck();
       final Element nameLink = nameLinks.get(0);
       deck.setName(nameLink.text());
       deck.setUrl(nameLink.attr("href"));
       deck.setType(typeCells.get(0).text());
       deck.setPlayerClass(classCells.get(0).text());
       deck.setRating(ratingCells.get(0).text());
       deck.setCost(costCells.get(0).text());

       final Element updated = updatedCells.get(0);
       if (updated.hasAttr("data-epoch")) {
         deck.setLastUpdate(updated.attributes().get("data-epoch"));
       }
       deck.setLastUpdateAsString(updated.text());
       decks.add(deck);
     }
   } catch (IOException ex) {
     ex.printStackTrace();
   }
   return decks;
 }
Exemple #12
0
 /**
  * Creates a copy of this element via {@code super.clone()} and clears the copy's
  * {@code classNames} field.
  *
  * @return the cloned element
  */
 @Override
 public Element clone() {
   Element copy = (Element) super.clone();
   // Cleared so the copy derives its own class names on first use instead of pointing at
   // the source element's class names.
   copy.classNames = null;
   return copy;
 }