コード例 #1
0
  public String urlStringToKey(final String urlString) throws URIException {

    if (urlString.startsWith("dns:")) {
      return urlString;
    }
    String searchUrl = canonicalize(urlString);
    String scheme = UrlOperations.urlToScheme(searchUrl);
    if (scheme != null) {
      searchUrl = searchUrl.substring(scheme.length());
    } else {
      scheme = UrlOperations.HTTP_SCHEME;
    }

    if (-1 == searchUrl.indexOf("/")) {
      searchUrl = scheme + searchUrl + "/";
    } else {
      searchUrl = scheme + searchUrl;
    }

    // Custom rules

    for (CanonicalizationRule rule : getProcessingRules()) {
      searchUrl = rule.processIfMatches(new CanonicalizationInput(searchUrl));
    }

    // Core rules

    // TODO: These next few lines look crazy -- need to be reworked.. This
    // was the only easy way I could find to get the correct unescaping
    // out of UsableURIs, possible a bug. Definitely needs some TLC in any case,
    // as building UsableURIs is *not* a cheap operation.

    // unescape anything that can be:
    UsableURI tmpURI = null;
    try {
      tmpURI = UsableURIFactory.getInstance(searchUrl);
    } catch (StringIndexOutOfBoundsException e) {
      LOGGER.warning(e.getMessage() + ": " + searchUrl);
      return searchUrl;
      //		} catch(URIException e) {
      //			LOGGER.warning(e.getMessage() + ": " + searchUrl);
      //			return searchUrl;
    }
    tmpURI.setPath(tmpURI.getPath());

    // convert to UsableURI to perform required URI fixup:
    UsableURI searchURI = UsableURIFactory.getInstance(tmpURI.getURI());

    // replace ' ' with '+' (this is only to match Alexa's canonicalization)
    String newPath = searchURI.getEscapedPath().replace("%20", "+");

    // replace multiple consecutive '/'s in the path.
    while (newPath.contains("//")) {
      newPath = newPath.replace("//", "/");
    }

    // this would remove trailing a '/' character, unless the path is empty
    // but we're not going to do this just yet..
    //		if((newPath.length() > 1) && newPath.endsWith("/")) {
    //			newPath = newPath.substring(0,newPath.length()-1);
    //		}

    StringBuilder sb = new StringBuilder(searchUrl.length());
    sb.append(searchURI.getHostBasename());

    // omit port if scheme default:
    int defaultSchemePort = UrlOperations.schemeToDefaultPort(scheme);
    if (searchURI.getPort() != defaultSchemePort && searchURI.getPort() != -1) {

      sb.append(":").append(searchURI.getPort());
    }

    sb.append(newPath);
    if (searchURI.getEscapedQuery() != null) {
      sb.append("?").append(searchURI.getEscapedQuery());
    }

    return sb.toString();
  }
コード例 #2
0
  @Override
  public void map(
      Text key, WritableArchiveRecord value, OutputCollector<Text, Text> output, Reporter reporter)
      throws IOException {
    ArchiveRecord record = value.getRecord();
    ArchiveRecordHeader header = record.getHeader();

    // Logging for debug info:
    log.debug(
        "Processing @"
            + header.getOffset()
            + "+"
            + record.available()
            + ","
            + header.getLength()
            + ": "
            + header.getUrl());
    for (String h : header.getHeaderFields().keySet()) {
      log.debug("ArchiveHeader: " + h + " -> " + header.getHeaderValue(h));
    }

    try {
      MDX mdx = new MDX();
      Date crawl_date = ArchiveUtils.parse14DigitISODate(header.getDate(), null);
      if (crawl_date != null) {
        mdx.setTs(ArchiveUtils.get14DigitDate(crawl_date));
      } else {
        mdx.setTs(header.getDate());
      }
      mdx.setUrl(header.getUrl());
      mdx.setHash(header.getDigest());

      // Data from WARC record:
      mdx.put("source-file", key.toString());
      mdx.put("content-type", header.getMimetype());
      mdx.put("content-length", "" + header.getContentLength());
      mdx.put("length", "" + header.getLength());
      mdx.put("source-offset", "" + header.getOffset());
      mdx.put("record-identifier", header.getRecordIdentifier());
      for (String k : header.getHeaderFieldKeys()) {
        mdx.put("HEADER-" + k, "" + header.getHeaderValue(k));
      }

      // check record type and look for HTTP data:
      Header[] httpHeaders = null;
      if (record instanceof WARCRecord) {
        mdx.setRecordType("warc." + header.getHeaderValue(HEADER_KEY_TYPE));
        mdx.setHash("" + header.getHeaderValue(WARCConstants.HEADER_KEY_PAYLOAD_DIGEST));
        // There are not always headers! The code should check first.
        String statusLine = HttpParser.readLine(record, "UTF-8");
        if (statusLine != null && statusLine.startsWith("HTTP")) {
          String firstLine[] = statusLine.split(" ");
          if (firstLine.length > 1) {
            String statusCode = firstLine[1].trim();
            mdx.put("status-code", statusCode);
            try {
              httpHeaders = HttpParser.parseHeaders(record, "UTF-8");
            } catch (ProtocolException p) {
              log.error(
                  "ProtocolException ["
                      + statusCode
                      + "]: "
                      + header.getHeaderValue(WARCConstants.HEADER_KEY_FILENAME)
                      + "@"
                      + header.getHeaderValue(WARCConstants.ABSOLUTE_OFFSET_KEY),
                  p);
            }
          } else {
            log.warn("Could not parse status line: " + statusLine);
          }
        } else {
          log.warn(
              "Invalid status line: "
                  + header.getHeaderValue(WARCConstants.HEADER_KEY_FILENAME)
                  + "@"
                  + header.getHeaderValue(WARCConstants.ABSOLUTE_OFFSET_KEY));
        }

      } else if (record instanceof ARCRecord) {
        mdx.setRecordType("arc");
        ARCRecord arcr = (ARCRecord) record;
        mdx.put("status-code", "" + arcr.getStatusCode());
        httpHeaders = arcr.getHttpHeaders();

      } else {
        mdx.setRecordType("unknown");
      }

      // Add in http headers
      if (httpHeaders != null) {
        for (Header h : httpHeaders) {
          mdx.put("HTTP-" + h.getName(), h.getValue());
        }
      }

      // URL:
      String uri = header.getUrl();
      if (uri != null) {
        UsableURI uuri = UsableURIFactory.getInstance(uri);
        // Hosts:
        if ("https".contains(uuri.getScheme())) {
          mdx.put("host", uuri.getAuthority());
        }
      } else {
        mdx.put("errors", "malformed-url");
      }

      // Year
      String date = header.getDate();
      if (date != null && date.length() > 4) {
        mdx.put("year", date.substring(0, 4));
      } else {
        mdx.put("errors", "malformed-date");
      }

      // And collect:
      String outKey = mdx.getHash();
      if (outKey == null || outKey == "" || "null".equals(outKey)) {
        outKey = mdx.getRecordType() + ":" + header.getMimetype();
      } else {
        outKey = mdx.getRecordType() + ":" + outKey;
      }

      output.collect(new Text(outKey), new Text(mdx.toString()));
    } catch (JSONException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }
  }