コード例 #1
0
ファイル: URLCanonicalizer.java プロジェクト: raufbutt/zappy
  /**
   * Gets the canonical url, starting from a relative or absolute url found in a given context
   * (baseURL).
   *
   * <p>The url is first resolved against the base url, then rebuilt from its lower-cased scheme
   * and host, its port (omitted when it is the default port of the scheme), its cleaned-up path
   * and its query string with the parameters in sorted order. Only these components are passed
   * on to the resulting URL.
   *
   * @param url the url string defining the reference
   * @param baseURL the context in which this url was found; {@code null} is treated as an empty
   *     base
   * @return the canonical url, or {@code null} if the resolved url is syntactically invalid, has
   *     no scheme or host, or cannot be turned into a {@link URL}
   */
  public static String getCanonicalURL(String url, String baseURL) {

    try {
      /* Build the absolute URL, from the url and the baseURL */
      String resolvedURL = URLResolver.resolveUrl(baseURL == null ? "" : baseURL, url);
      log.debug("Resolved URL: " + resolvedURL);
      URI canonicalURI = new URI(resolvedURL);

      /* Some checking. */
      if (canonicalURI.getScheme() == null) {
        throw new MalformedURLException(
            "Protocol could not be reliably evaluated from uri: "
                + canonicalURI
                + " and base url: "
                + baseURL);
      }
      if (canonicalURI.getHost() == null) {
        throw new MalformedURLException(
            "Host could not be reliably evaluated from: " + canonicalURI);
      }

      /*
       * Normalize: no empty segments (i.e., "//"), no segments equal to ".", and no segments equal to
       * ".." that are preceded by a segment not equal to "..".
       */
      String path = canonicalURI.normalize().getRawPath();

      /* Convert '//' -> '/'; repeated because one pass over "///" still leaves "//" behind */
      while (path.contains("//")) {
        path = path.replace("//", "/");
      }

      /* Drop starting '/../' (keeps the trailing slash of the match as the new leading slash) */
      while (path.startsWith("/../")) {
        path = path.substring(3);
      }

      /* Trim */
      path = path.trim();

      /* Process parameters and sort them (the map is sorted by parameter name). */
      final SortedMap<String, String> params = createParameterMap(canonicalURI.getRawQuery());
      final String canonicalParams = canonicalize(params);
      final String queryString = canonicalParams.isEmpty() ? "" : "?" + canonicalParams;

      /* Add starting slash if needed */
      if (path.length() == 0) {
        path = "/" + path;
      }

      /*
       * Lowercasing protocol and host. Locale.ROOT keeps the result independent of the JVM default
       * locale (e.g. a Turkish locale would otherwise turn 'I' into a dotless 'i').
       */
      String protocol = canonicalURI.getScheme().toLowerCase(java.util.Locale.ROOT);
      String host = canonicalURI.getHost().toLowerCase(java.util.Locale.ROOT);

      /*
       * Drop default port: http://example.com:80 -> http://example.com and
       * https://example.com:443 -> https://example.com. The port is only dropped when it is the
       * default one of the scheme in use: https://example.com:80 is NOT the same resource as
       * https://example.com, so its port has to be kept.
       */
      int port = canonicalURI.getPort();
      boolean defaultHttpPort = port == 80 && "http".equals(protocol);
      boolean defaultHttpsPort = port == 443 && "https".equals(protocol);
      if (defaultHttpPort || defaultHttpsPort) {
        port = -1;
      }

      String pathAndQueryString = normalizePath(path) + queryString;

      URL result = new URL(protocol, host, port, pathAndQueryString);
      return result.toExternalForm();

    } catch (MalformedURLException ex) {
      log.warn(
          "Error while Processing URL in the spidering process (on base "
              + baseURL
              + "): "
              + ex.getMessage());
      return null;
    } catch (URISyntaxException ex) {
      log.warn(
          "Error while Processing URI in the spidering process (on base "
              + baseURL
              + "): "
              + ex.getMessage());
      return null;
    }
  }