/**
 * Collects the resources referenced by an HTML page.
 *
 * <p>Two sources are combined into one de-duplicated set:
 * <ol>
 *   <li>the {@code src}/{@code href} attributes of {@code iframe[src]}, {@code link[href]},
 *       {@code img[src]} and {@code script} elements, and</li>
 *   <li>URL candidates found in the contents of {@code script} elements, as reported by
 *       {@code findUrlCandidates}, {@code findUrlsInCode} and {@code resourcesFromCandidates}.</li>
 * </ol>
 *
 * <p>Both steps are best-effort: a malformed URL or an unparsable script is logged and skipped,
 * it never aborts the extraction.
 *
 * @param sourceUrl URL of the page the markup came from; used to build the prefix that internal
 *                  links are expanded with
 * @param html      the HTML markup to analyse
 * @return the resources found; backed by a set, so equal resources appear only once
 */
public Iterable<Resource> extractResources(String sourceUrl, String html) {
    Set<Resource> resources = Sets.newHashSet();
    String prefixForInternalLinks = URLHandler.createPrefixForInternalLinks(sourceUrl);

    Document doc = Jsoup.parse(html);
    Elements scripts = doc.select("script");

    List<Element> elements = new ArrayList<Element>();
    elements.addAll(doc.select("iframe[src]"));
    elements.addAll(doc.select("link[href]"));
    elements.addAll(doc.select("img[src]"));
    elements.addAll(scripts);

    for (Element element : elements) {
        // Prefer "src"; fall back to "href" when "src" is absent or contains no dot.
        String uri = element.attr("src").trim();
        if (!uri.contains(".")) {
            uri = element.attr("href").trim();
        }
        // Values without a dot (including the empty string of a script that has
        // neither attribute) are not treated as resource references.
        if (!uri.contains(".")) {
            continue;
        }

        uri = URLHandler.expandIfInternalLink(prefixForInternalLinks, uri);
        try {
            uri = URLHandler.extractHost(uri);
            resources.add(new Resource(uri, type(element.tag().toString())));
        } catch (MalformedURLException e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Malformed URL: \"" + uri + "\"");
            }
        }
    }

    List<String> javaScriptUrlCandidates = new ArrayList<String>();
    for (Element script : scripts) {
        try {
            String scriptContents = script.data();
            if (scriptContents.length() > 1) {
                ParserRunner.ParseResult parseResult = javascriptParser.parse(scriptContents);
                findUrlCandidates(parseResult.ast, javaScriptUrlCandidates);
            }
        } catch (Exception e) {
            // Best-effort: a script that cannot be analysed is skipped, but the failure
            // is recorded instead of being silently discarded. Debug level is used
            // because script elements may hold content that is not JavaScript.
            if (LOG.isDebugEnabled()) {
                LOG.debug("Could not analyse script contents for URL candidates", e);
            }
        }
    }

    List<String> splittedUrlCandidates = findUrlsInCode(javaScriptUrlCandidates);
    resources.addAll(resourcesFromCandidates(splittedUrlCandidates));

    return resources;
}
/**
 * Builds the combined data of this element. Data is e.g. the inside of a {@code script} tag.
 *
 * <p>Child nodes are visited in order: a {@code DataNode} child contributes its whole data, an
 * {@code Element} child contributes the result of its own {@code data()} call, and any other
 * kind of child contributes nothing.
 *
 * @return the concatenated data, or an empty string if there is none
 * @see #dataNodes()
 */
public String data() {
    final StringBuilder combined = new StringBuilder();
    for (final Node child : childNodes) {
        if (child instanceof DataNode) {
            combined.append(((DataNode) child).getWholeData());
            continue;
        }
        if (child instanceof Element) {
            // Recurse so data nested inside child elements is included as well.
            combined.append(((Element) child).data());
        }
    }
    return combined.toString();
}