/**
 * Collects the resources referenced by an HTML page.
 *
 * <p>Two sources are combined into one set:
 * <ol>
 *   <li>the {@code src}/{@code href} attributes of {@code iframe[src]}, {@code link[href]},
 *       {@code img[src]} and all {@code script} elements;</li>
 *   <li>URL-like string tokens found inside the contents of {@code script} elements.</li>
 * </ol>
 * Every URI is passed through {@code URLHandler.extractHost} before it is wrapped in a
 * {@link Resource}. Extraction is best-effort: malformed URLs and scripts that cannot be
 * parsed are logged at warn level and skipped, they never abort the extraction.
 *
 * @param sourceUrl URL of the page the markup was loaded from; used to build the prefix that
 *                  expands internal links
 * @param html      the markup to analyse
 * @return the collected resources (backed by a set, so entries that are equal collapse)
 */
public Iterable<Resource> extractResources(String sourceUrl, String html) {
    Set<Resource> resources = Sets.newHashSet();
    String prefixForInternalLinks = URLHandler.createPrefixForInternalLinks(sourceUrl);

    Document doc = Jsoup.parse(html);
    // Selected once and reused: scripts contribute both their src attribute and their contents.
    Elements scripts = doc.select("script");

    List<Element> elements = new ArrayList<Element>();
    elements.addAll(doc.select("iframe[src]"));
    elements.addAll(doc.select("link[href]"));
    elements.addAll(doc.select("img[src]"));
    elements.addAll(scripts);

    for (Element element : elements) {
        // Prefer src; fall back to href when src does not contain a dot.
        String uri = element.attr("src").trim();
        if (!uri.contains(".")) {
            uri = element.attr("href").trim();
        }
        if (!uri.contains(".")) {
            // Neither attribute contains a dot (e.g. an inline script without src): nothing to record.
            continue;
        }

        uri = URLHandler.expandIfInternalLink(prefixForInternalLinks, uri);
        try {
            uri = URLHandler.extractHost(uri);
            resources.add(new Resource(uri, type(element.tag().toString())));
        } catch (MalformedURLException e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Malformed URL: \"" + uri + "\"");
            }
        }
    }

    List<String> javaScriptUrlCandidates = new ArrayList<String>();
    for (Element script : scripts) {
        try {
            String scriptContents = script.data();
            if (scriptContents.length() > 1) {
                ParserRunner.ParseResult parseResult = javascriptParser.parse(scriptContents);
                findUrlCandidates(parseResult.ast, javaScriptUrlCandidates);
            }
        } catch (Exception e) {
            // Deliberately broad: one unparsable script must not abort the whole page.
            // The failure used to be swallowed silently; report it so it can be diagnosed.
            if (LOG.isWarnEnabled()) {
                LOG.warn("Could not analyse inline script of \"" + sourceUrl + "\"", e);
            }
        }
    }

    List<String> splittedUrlCandidates = findUrlsInCode(javaScriptUrlCandidates);
    resources.addAll(resourcesFromCandidates(splittedUrlCandidates));
    return resources;
}
/**
 * Opens a stream for the given URL.
 *
 * <p>The text before the first {@code ':'} is looked up via {@code getUrlHandler}; when a
 * handler is found, it opens the remainder of the URL (everything after the colon). In all
 * other cases the URL is opened through {@link java.net.URL#openStream()}.
 *
 * @param url the URL to open
 * @return an input stream for the URL's content
 * @throws IOException if the stream cannot be opened
 */
public InputStream openURL(String url) throws IOException {
    final int colonAt = url.indexOf(':');

    // A colon at index 0 or 1 is never treated as a protocol separator
    // (presumably so one-letter prefixes such as drive letters fall through -- TODO confirm).
    if (colonAt > 1) {
        final URLHandler handler = getUrlHandler(url.substring(0, colonAt));
        if (handler != null) {
            return handler.openStream(url.substring(colonAt + 1));
        }
    }

    // No colon, too-short prefix, or no registered handler: use the JDK's own URL support.
    return new URL(url).openStream();
}
/**
 * Converts candidate URLs into {@code SCRIPT} resources.
 *
 * <p>Each candidate is passed through {@code URLHandler.extractHost}; the result becomes a
 * {@link Resource} of type {@code Resource.Type.SCRIPT}. A candidate that raises
 * {@link MalformedURLException} is logged at warn level and skipped.
 *
 * @param candidateUrls the URL strings to convert
 * @return the set of resources built from the accepted candidates
 */
private Set<Resource> resourcesFromCandidates(List<String> candidateUrls) {
    Set<Resource> collected = Sets.newHashSet();

    for (String candidate : candidateUrls) {
        // Tracks the value currently being processed so the warning reports exactly
        // what was in flight when the exception was raised.
        String current = candidate;
        try {
            current = URLHandler.extractHost(current);
            Resource scriptResource = new Resource(current, Resource.Type.SCRIPT);
            collected.add(scriptResource);
        } catch (MalformedURLException e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Malformed URL: \"" + current + "\"");
            }
        }
    }

    return collected;
}
/**
 * Extracts URL-like tokens from strings found in code.
 *
 * <p>Every input string is split at double and single quote characters. Each resulting token
 * is trimmed and kept only if {@code URLHandler.couldBeUrl} accepts it; in kept tokens every
 * backslash-dot sequence is replaced by a plain dot.
 *
 * @param candidateUrls strings that may contain URLs
 * @return the accepted tokens, in encounter order (duplicates are retained)
 */
private List<String> findUrlsInCode(List<String> candidateUrls) {
    List<String> accepted = new ArrayList<String>();

    for (String candidate : candidateUrls) {
        for (String piece : candidate.split("\"|'")) {
            String trimmed = piece.trim();
            if (!URLHandler.couldBeUrl(trimmed)) {
                continue;
            }
            // Literal (non-regex) replacement: turns an escaped dot back into a plain dot.
            accepted.add(trimmed.replace("\\.", "."));
        }
    }

    return accepted;
}