private void buildUrlSets(String url) { try { outputMessage("\nFetching " + url, TEST_SUMMARY_MESSAGE); URL srcUrl = new URL(url); // URLConnection conn = srcUrl.openConnection(); // String type = conn.getContentType(); // type = conn.getHeaderField("content-type"); // InputStream istr = conn.getInputStream(); LockssUrlConnection conn = UrlUtil.openConnection(url, connectionPool); if (proxyHost != null) { conn.setProxy(proxyHost, proxyPort); } if (userAgent != null) { conn.setRequestProperty("user-agent", userAgent); } try { conn.execute(); int resp = conn.getResponseCode(); if (resp != 200) { outputMessage("Resp: " + resp + ": " + conn.getResponseMessage(), TEST_SUMMARY_MESSAGE); return; } depth_fetched[m_curDepth - 1]++; String cookies = conn.getResponseHeaderValue("Set-Cookie"); if (cookies != null) { outputMessage("Cookies: " + cookies, PLAIN_MESSAGE); } String type = conn.getResponseContentType(); if (type == null || !type.toLowerCase().startsWith("text/html")) { outputMessage("Type: " + type + ", not parsing", URL_SUMMARY_MESSAGE); return; } outputMessage("Type: " + type + ", extracting Urls", URL_SUMMARY_MESSAGE); InputStream istr = conn.getResponseInputStream(); InputStreamReader reader = new InputStreamReader(istr); // MyMockCachedUrl mcu = new MyMockCachedUrl(srcUrl.toString(), reader); GoslingHtmlLinkExtractor extractor = new GoslingHtmlLinkExtractor(); extractor.extractUrls(null, istr, null, srcUrl.toString(), new MyLinkExtractorCallback()); istr.close(); depth_parsed[m_curDepth - 1]++; } finally { conn.release(); } } catch (MalformedURLException murle) { murle.printStackTrace(); outputErrResults(url, "Malformed URL:" + murle.getMessage()); } catch (IOException ex) { ex.printStackTrace(); outputErrResults(url, "IOException: " + ex.getMessage()); } }
private ExternalizableMap loadMap(String extMapName, ClassLoader loader) throws FileNotFoundException { String first = null; String next = extMapName; List<String> urls = new ArrayList<String>(); ExternalizableMap res = null; while (next != null) { // convert the plugin class name to an xml file name String mapFile = next.replace('.', '/') + MAP_SUFFIX; URL url = loader.getResource(mapFile); if (url != null && urls.contains(url.toString())) { throw new PluginException.InvalidDefinition("Plugin inheritance loop: " + next); } // load into map ExternalizableMap oneMap = new ExternalizableMap(); oneMap.loadMapFromResource(mapFile, loader); urls.add(url.toString()); // apply overrides one plugin at a time in inheritance chain processOverrides(oneMap); if (res == null) { res = oneMap; } else { for (Map.Entry ent : oneMap.entrySet()) { String key = (String) ent.getKey(); Object val = ent.getValue(); if (!res.containsKey(key)) { res.setMapElement(key, val); } } } if (oneMap.containsKey(KEY_PLUGIN_PARENT)) { next = oneMap.getString(KEY_PLUGIN_PARENT); } else { next = null; } } loadedFromUrls = urls; return res; }
/** * mapUrlToFileLocation() is the method used to resolve urls into file names. This maps a given * url to a file location, using the au top directory as the base. It creates directories which * mirror the html string, so 'http://www.journal.org/issue1/index.html' would be cached in the * file: <rootLocation>/www.journal.org/http/issue1/index.html * * @param rootLocation the top directory for ArchivalUnit this URL is in * @param urlStr the url to translate * @return the url file location * @throws java.net.MalformedURLException */ public static String mapUrlToFileLocation(String rootLocation, String urlStr) throws MalformedURLException { int totalLength = rootLocation.length() + urlStr.length(); URL url = new URL(urlStr); StringBuilder buffer = new StringBuilder(totalLength); buffer.append(rootLocation); if (!rootLocation.endsWith(File.separator)) { buffer.append(File.separator); } buffer.append(url.getHost().toLowerCase()); int port = url.getPort(); if (port != -1) { buffer.append(PORT_SEPARATOR); buffer.append(port); } buffer.append(File.separator); buffer.append(url.getProtocol()); if (RepositoryManager.isEnableLongComponents()) { String escapedPath = escapePath( StringUtil.replaceString(url.getPath(), UrlUtil.URL_PATH_SEPARATOR, File.separator)); String query = url.getQuery(); if (query != null) { escapedPath = escapedPath + "?" + escapeQuery(query); } String encodedPath = RepositoryNodeImpl.encodeUrl(escapedPath); // encodeUrl strips leading / from path buffer.append(File.separator); buffer.append(encodedPath); } else { buffer.append( escapePath( StringUtil.replaceString(url.getPath(), UrlUtil.URL_PATH_SEPARATOR, File.separator))); String query = url.getQuery(); if (query != null) { buffer.append("?"); buffer.append(escapeQuery(query)); } } return buffer.toString(); }