/**
 * Collects the resources referenced by an HTML document.
 *
 * <p>Two sources are examined:
 * <ol>
 *   <li>the {@code src}/{@code href} attributes of {@code iframe[src]},
 *       {@code link[href]}, {@code img[src]} and all {@code script} elements;</li>
 *   <li>URL candidates found inside the contents of inline {@code script}
 *       elements, obtained via {@code javascriptParser}, {@code findUrlCandidates}
 *       and {@code findUrlsInCode}.</li>
 * </ol>
 *
 * <p>Every URI is passed through {@code URLHandler.extractHost} before being
 * stored (presumably reducing it to its host part; confirm against
 * {@code URLHandler}). URIs rejected with a {@link MalformedURLException} are
 * logged as warnings and skipped. Script parsing is best-effort: a script that
 * cannot be processed is logged and skipped, it never aborts the extraction.
 *
 * @param sourceUrl URL of the page the HTML was loaded from; used to build the
 *                  prefix handed to {@code URLHandler.expandIfInternalLink}
 * @param html      the HTML markup to analyse
 * @return the set of distinct resources found (duplicates are collapsed by the
 *         backing set)
 */
public Iterable<Resource> extractResources(String sourceUrl, String html) {
    Set<Resource> resources = Sets.newHashSet();
    String prefixForInternalLinks = URLHandler.createPrefixForInternalLinks(sourceUrl);

    Document doc = Jsoup.parse(html);
    Elements scripts = doc.select("script");

    List<Element> elements = new ArrayList<Element>();
    elements.addAll(doc.select("iframe[src]"));
    elements.addAll(doc.select("link[href]"));
    elements.addAll(doc.select("img[src]"));
    elements.addAll(scripts);

    for (Element element : elements) {
        // Prefer "src"; fall back to "href" when "src" does not look like a URI.
        String uri = element.attr("src").trim();
        if (!uri.contains(".")) {
            uri = element.attr("href").trim();
        }
        // Values without a dot (including the empty string returned for a
        // missing attribute, e.g. inline scripts) are not treated as URIs.
        if (!uri.contains(".")) {
            continue;
        }

        uri = URLHandler.expandIfInternalLink(prefixForInternalLinks, uri);
        try {
            uri = URLHandler.extractHost(uri);
            resources.add(new Resource(uri, type(element.tag().toString())));
        } catch (MalformedURLException e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Malformed URL: \"" + uri + "\"");
            }
        }
    }

    List<String> javaScriptUrlCandidates = new ArrayList<String>();
    for (Element script : scripts) {
        try {
            String scriptContents = script.data();
            if (scriptContents.length() > 1) {
                ParserRunner.ParseResult parseResult = javascriptParser.parse(scriptContents);
                findUrlCandidates(parseResult.ast, javaScriptUrlCandidates);
            }
        } catch (Exception e) {
            // Deliberately broad: a single unparsable inline script must not
            // abort extraction for the whole page. Report it instead of
            // silently dropping the failure.
            if (LOG.isWarnEnabled()) {
                LOG.warn("Unable to analyse inline script of \"" + sourceUrl + "\": " + e);
            }
        }
    }

    List<String> splittedUrlCandidates = findUrlsInCode(javaScriptUrlCandidates);
    resources.addAll(resourcesFromCandidates(splittedUrlCandidates));

    return resources;
}
/**
 * Converts candidate URLs into {@code SCRIPT} resources.
 *
 * <p>Each candidate is passed through {@code URLHandler.extractHost}; the
 * result is wrapped in a {@link Resource} of type {@code Resource.Type.SCRIPT}.
 * Candidates that raise a {@link MalformedURLException} are logged as warnings
 * and left out of the result.
 *
 * @param candidateUrls the URL strings to convert
 * @return the distinct resources built from the accepted candidates
 */
private Set<Resource> resourcesFromCandidates(List<String> candidateUrls) {
    final Set<Resource> extracted = Sets.newHashSet();

    for (final String candidate : candidateUrls) {
        // Working copy: holds the raw candidate until host extraction succeeds,
        // so the warning below reports whichever value was current on failure.
        String current = candidate;
        try {
            current = URLHandler.extractHost(current);
            final Resource scriptResource = new Resource(current, Resource.Type.SCRIPT);
            extracted.add(scriptResource);
        } catch (MalformedURLException malformed) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Malformed URL: \"" + current + "\"");
            }
        }
    }

    return extracted;
}