예제 #1
0
  private void buildUrlSets(String url) {

    try {
      outputMessage("\nFetching " + url, TEST_SUMMARY_MESSAGE);
      URL srcUrl = new URL(url);
      //       URLConnection conn = srcUrl.openConnection();
      //       String type = conn.getContentType();
      //       type = conn.getHeaderField("content-type");
      //       InputStream istr = conn.getInputStream();

      LockssUrlConnection conn = UrlUtil.openConnection(url, connectionPool);
      if (proxyHost != null) {
        conn.setProxy(proxyHost, proxyPort);
      }
      if (userAgent != null) {
        conn.setRequestProperty("user-agent", userAgent);
      }
      try {
        conn.execute();
        int resp = conn.getResponseCode();
        if (resp != 200) {
          outputMessage("Resp: " + resp + ": " + conn.getResponseMessage(), TEST_SUMMARY_MESSAGE);
          return;
        }
        depth_fetched[m_curDepth - 1]++;
        String cookies = conn.getResponseHeaderValue("Set-Cookie");
        if (cookies != null) {
          outputMessage("Cookies: " + cookies, PLAIN_MESSAGE);
        }
        String type = conn.getResponseContentType();
        if (type == null || !type.toLowerCase().startsWith("text/html")) {
          outputMessage("Type: " + type + ", not parsing", URL_SUMMARY_MESSAGE);
          return;
        }
        outputMessage("Type: " + type + ", extracting Urls", URL_SUMMARY_MESSAGE);
        InputStream istr = conn.getResponseInputStream();
        InputStreamReader reader = new InputStreamReader(istr);
        //       MyMockCachedUrl mcu = new MyMockCachedUrl(srcUrl.toString(), reader);
        GoslingHtmlLinkExtractor extractor = new GoslingHtmlLinkExtractor();
        extractor.extractUrls(null, istr, null, srcUrl.toString(), new MyLinkExtractorCallback());
        istr.close();
        depth_parsed[m_curDepth - 1]++;
      } finally {
        conn.release();
      }
    } catch (MalformedURLException murle) {
      murle.printStackTrace();
      outputErrResults(url, "Malformed URL:" + murle.getMessage());
    } catch (IOException ex) {
      ex.printStackTrace();
      outputErrResults(url, "IOException: " + ex.getMessage());
    }
  }
예제 #2
0
 private ExternalizableMap loadMap(String extMapName, ClassLoader loader)
     throws FileNotFoundException {
   String first = null;
   String next = extMapName;
   List<String> urls = new ArrayList<String>();
   ExternalizableMap res = null;
   while (next != null) {
     // convert the plugin class name to an xml file name
     String mapFile = next.replace('.', '/') + MAP_SUFFIX;
     URL url = loader.getResource(mapFile);
     if (url != null && urls.contains(url.toString())) {
       throw new PluginException.InvalidDefinition("Plugin inheritance loop: " + next);
     }
     // load into map
     ExternalizableMap oneMap = new ExternalizableMap();
     oneMap.loadMapFromResource(mapFile, loader);
     urls.add(url.toString());
     // apply overrides one plugin at a time in inheritance chain
     processOverrides(oneMap);
     if (res == null) {
       res = oneMap;
     } else {
       for (Map.Entry ent : oneMap.entrySet()) {
         String key = (String) ent.getKey();
         Object val = ent.getValue();
         if (!res.containsKey(key)) {
           res.setMapElement(key, val);
         }
       }
     }
     if (oneMap.containsKey(KEY_PLUGIN_PARENT)) {
       next = oneMap.getString(KEY_PLUGIN_PARENT);
     } else {
       next = null;
     }
   }
   loadedFromUrls = urls;
   return res;
 }
 /**
  * mapUrlToFileLocation() is the method used to resolve urls into file names. This maps a given
  * url to a file location, using the au top directory as the base. It creates directories which
  * mirror the html string, so 'http://www.journal.org/issue1/index.html' would be cached in the
  * file: <rootLocation>/www.journal.org/http/issue1/index.html
  *
  * @param rootLocation the top directory for ArchivalUnit this URL is in
  * @param urlStr the url to translate
  * @return the url file location
  * @throws java.net.MalformedURLException
  */
 public static String mapUrlToFileLocation(String rootLocation, String urlStr)
     throws MalformedURLException {
   int totalLength = rootLocation.length() + urlStr.length();
   URL url = new URL(urlStr);
   StringBuilder buffer = new StringBuilder(totalLength);
   buffer.append(rootLocation);
   if (!rootLocation.endsWith(File.separator)) {
     buffer.append(File.separator);
   }
   buffer.append(url.getHost().toLowerCase());
   int port = url.getPort();
   if (port != -1) {
     buffer.append(PORT_SEPARATOR);
     buffer.append(port);
   }
   buffer.append(File.separator);
   buffer.append(url.getProtocol());
   if (RepositoryManager.isEnableLongComponents()) {
     String escapedPath =
         escapePath(
             StringUtil.replaceString(url.getPath(), UrlUtil.URL_PATH_SEPARATOR, File.separator));
     String query = url.getQuery();
     if (query != null) {
       escapedPath = escapedPath + "?" + escapeQuery(query);
     }
     String encodedPath = RepositoryNodeImpl.encodeUrl(escapedPath);
     // encodeUrl strips leading / from path
     buffer.append(File.separator);
     buffer.append(encodedPath);
   } else {
     buffer.append(
         escapePath(
             StringUtil.replaceString(url.getPath(), UrlUtil.URL_PATH_SEPARATOR, File.separator)));
     String query = url.getQuery();
     if (query != null) {
       buffer.append("?");
       buffer.append(escapeQuery(query));
     }
   }
   return buffer.toString();
 }