/**
 * Collects the resources referenced by an HTML document.
 *
 * <p>Two sources are examined:
 * <ol>
 *   <li>the {@code src} / {@code href} attributes of {@code iframe}, {@code link},
 *       {@code img} and {@code script} elements;</li>
 *   <li>URL-like string tokens found in the bodies of inline {@code script} elements.</li>
 * </ol>
 * Every URI is reduced via {@code URLHandler.extractHost} before it is stored, and
 * URIs that fail with a {@link MalformedURLException} are logged and skipped.
 *
 * @param sourceUrl URL used to build the prefix with which internal links are expanded
 * @param html the HTML markup to parse
 * @return the distinct resources found, in no particular order
 */
public Iterable<Resource> extractResources(String sourceUrl, String html) {

    Set<Resource> resources = Sets.newHashSet();
    String prefixForInternalLinks = URLHandler.createPrefixForInternalLinks(sourceUrl);

    Document doc = Jsoup.parse(html);
    Elements scripts = doc.select("script");

    List<Element> elements = new ArrayList<Element>();
    elements.addAll(doc.select("iframe[src]"));
    elements.addAll(doc.select("link[href]"));
    elements.addAll(doc.select("img[src]"));
    elements.addAll(scripts);

    for (Element element : elements) {
      // Prefer "src"; fall back to "href" when "src" holds nothing containing a dot.
      String uri = element.attr("src").trim();
      if (!uri.contains(".")) {
        uri = element.attr("href").trim();
      }

      // Neither attribute contains a dot (e.g. an inline script): nothing to record.
      if (!uri.contains(".")) {
        continue;
      }

      uri = URLHandler.expandIfInternalLink(prefixForInternalLinks, uri);
      try {
        uri = URLHandler.extractHost(uri);
        resources.add(new Resource(uri, type(element.tag().toString())));
      } catch (MalformedURLException e) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("Malformed URL: \"" + uri + "\"");
        }
      }
    }

    // Second pass: scan the bodies of inline scripts for URL candidates.
    List<String> javaScriptUrlCandidates = new ArrayList<String>();
    for (Element script : scripts) {
      try {
        String scriptContents = script.data();
        if (scriptContents.length() > 1) {
          ParserRunner.ParseResult parseResult = javascriptParser.parse(scriptContents);
          findUrlCandidates(parseResult.ast, javaScriptUrlCandidates);
        }
      } catch (Exception e) {
        // Best effort: one script that cannot be analysed must not abort the extraction
        // for the rest of the page, but the failure is reported instead of being dropped.
        if (LOG.isWarnEnabled()) {
          LOG.warn("Could not analyse inline script: " + e);
        }
      }
    }

    List<String> splittedUrlCandidates = findUrlsInCode(javaScriptUrlCandidates);

    resources.addAll(resourcesFromCandidates(splittedUrlCandidates));

    return resources;
  }
Esempio n. 2
0
 /**
  * Opens a stream for the given URL string.
  *
  * <p>When the text before the first {@code ':'} is at least two characters long and
  * {@code getUrlHandler} returns a handler for it, that handler opens the stream for
  * the text after the colon. In every other case the string is opened through
  * {@link URL#openStream()}.
  *
  * @param url the URL string to open
  * @return an input stream for the addressed content
  * @throws IOException if the stream cannot be opened
  */
 public InputStream openURL(String url) throws IOException {
   final int colon = url.indexOf(':');

   // NOTE(review): the "> 1" bound presumably keeps one-letter prefixes (such as
   // Windows drive letters) from being treated as protocols - confirm.
   URLHandler handler = null;
   if (colon > 1) {
     handler = getUrlHandler(url.substring(0, colon));
   }

   if (handler == null) {
     // No usable protocol prefix or no registered handler: use the JDK's URL support.
     return new URL(url).openStream();
   }
   return handler.openStream(url.substring(colon + 1));
 }
 /**
  * Turns candidate URL strings into {@code SCRIPT} resources.
  *
  * <p>Each candidate is passed through {@code URLHandler.extractHost}; candidates for
  * which that throws {@link MalformedURLException} are logged at warn level and skipped.
  *
  * @param candidateUrls the candidate URL strings to convert
  * @return the distinct resources built from the candidates that could be processed
  */
 private Set<Resource> resourcesFromCandidates(List<String> candidateUrls) {
   Set<Resource> collected = Sets.newHashSet();

   Iterator<String> candidates = candidateUrls.iterator();
   while (candidates.hasNext()) {
     // Working copy: holds the raw candidate until extractHost succeeds, so the
     // warning below reports whichever value was current when the failure occurred.
     String current = candidates.next();
     try {
       current = URLHandler.extractHost(current);
       collected.add(new Resource(current, Resource.Type.SCRIPT));
     } catch (MalformedURLException e) {
       if (LOG.isWarnEnabled()) {
         LOG.warn("Malformed URL: \"" + current + "\"");
       }
     }
   }

   return collected;
 }
  /**
   * Extracts URL-like tokens from a list of code strings.
   *
   * <p>Each string is split on double and single quote characters. Every resulting
   * piece is trimmed and kept only if {@code URLHandler.couldBeUrl} accepts it. In the
   * kept pieces each literal backslash-dot sequence is replaced by a plain dot.
   *
   * @param candidateUrls the code strings to scan
   * @return the accepted tokens, in encounter order (duplicates are retained)
   */
  private List<String> findUrlsInCode(List<String> candidateUrls) {
    List<String> found = new ArrayList<String>();

    for (String candidate : candidateUrls) {
      for (String piece : candidate.split("\"|'")) {
        String trimmed = piece.trim();
        if (!URLHandler.couldBeUrl(trimmed)) {
          continue;
        }
        // String.replace is literal (not regex): "\." becomes ".".
        found.add(trimmed.replace("\\.", "."));
      }
    }

    return found;
  }