/**
 * Collects the resources referenced by an HTML page.
 *
 * <p>Two sources are considered:
 * <ol>
 *   <li>the {@code src} / {@code href} attributes of {@code iframe}, {@code link}, {@code img}
 *       and {@code script} elements, and</li>
 *   <li>URL candidates found inside the code of inline {@code script} elements.</li>
 * </ol>
 *
 * <p>Extraction is best-effort: a malformed URL or an inline script that cannot be parsed is
 * logged and skipped, it never aborts the extraction of the remaining resources.
 *
 * @param sourceUrl URL of the page; used to build the prefix for expanding internal links
 * @param html the markup of the page
 * @return the distinct resources found (backed by a set, so without duplicates)
 */
public Iterable<Resource> extractResources(String sourceUrl, String html) {

    Set<Resource> resources = Sets.newHashSet();
    String prefixForInternalLinks = URLHandler.createPrefixForInternalLinks(sourceUrl);

    Document doc = Jsoup.parse(html);
    Elements scripts = doc.select("script");

    List<Element> elements = new ArrayList<Element>();
    elements.addAll(doc.select("iframe[src]"));
    elements.addAll(doc.select("link[href]"));
    elements.addAll(doc.select("img[src]"));
    elements.addAll(scripts);

    for (Element element : elements) {
      // Prefer "src"; fall back to "href" when "src" is absent or does not contain a dot.
      String uri = element.attr("src").trim();
      if (!uri.contains(".")) {
        uri = element.attr("href").trim();
      }

      // Values without a dot are skipped (e.g. inline scripts, which have neither attribute).
      if (!uri.contains(".")) {
        continue;
      }

      uri = URLHandler.expandIfInternalLink(prefixForInternalLinks, uri);
      try {
        // Kept in its own variable so the warning below always reports the expanded URI.
        String host = URLHandler.extractHost(uri);
        resources.add(new Resource(host, type(element.tag().toString())));
      } catch (MalformedURLException e) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("Malformed URL: \"" + uri + "\"");
        }
      }
    }

    List<String> javaScriptUrlCandidates = new ArrayList<String>();
    for (Element script : scripts) {
      try {
        String scriptContents = script.data();
        if (scriptContents.length() > 1) {
          ParserRunner.ParseResult parseResult = javascriptParser.parse(scriptContents);
          findUrlCandidates(parseResult.ast, javaScriptUrlCandidates);
        }
      } catch (Exception e) {
        // Best-effort: one unparsable inline script must not prevent the extraction of the
        // other resources, but the failure is reported instead of being silently dropped.
        if (LOG.isWarnEnabled()) {
          LOG.warn("Could not extract URL candidates from inline script of \"" + sourceUrl + "\"", e);
        }
      }
    }

    List<String> splittedUrlCandidates = findUrlsInCode(javaScriptUrlCandidates);

    resources.addAll(resourcesFromCandidates(splittedUrlCandidates));

    return resources;
  }
// Example #2
// 0
  /**
   * Returns the concatenated data content of this element, e.g. the text inside a
   * {@code script} tag.
   *
   * <p>The data of direct {@code DataNode} children is appended in child order; child elements
   * contribute their own combined data recursively. Other node types are ignored.
   *
   * @return the combined data, or an empty string if there is none
   * @see #dataNodes()
   */
  public String data() {
    final StringBuilder combined = new StringBuilder();

    for (Node child : childNodes) {
      if (child instanceof DataNode) {
        combined.append(((DataNode) child).getWholeData());
        continue;
      }
      if (child instanceof Element) {
        // Descend into nested elements so that their data is included as well.
        combined.append(((Element) child).data());
      }
    }

    return combined.toString();
  }