/**
 * Converts a URL string into its canonical lookup-key form used for index
 * searching.
 *
 * <p>The key is built as: canonicalized URL, with a scheme defaulted to HTTP
 * when absent, a guaranteed "/" path, any configured custom
 * {@link CanonicalizationRule}s applied, then core normalization via
 * UsableURI (unescaping, "%20" -&gt; "+", collapsing of repeated slashes),
 * finally reassembled as host[:port]path[?query] with default-scheme ports
 * omitted.
 *
 * @param urlString the raw URL to convert; "dns:" records pass through as-is
 * @return the canonical search key for this URL
 * @throws URIException if the (fixed-up) URL cannot be parsed as a UsableURI
 */
public String urlStringToKey(final String urlString) throws URIException {
    // "dns:" records are not real URLs; store/look them up untouched.
    if (urlString.startsWith("dns:")) {
        return urlString;
    }
    String searchUrl = canonicalize(urlString);
    String scheme = UrlOperations.urlToScheme(searchUrl);
    // Strip a recognized scheme for now (re-prefixed below); default to HTTP
    // when none is present.
    if (scheme != null) {
        searchUrl = searchUrl.substring(scheme.length());
    } else {
        scheme = UrlOperations.HTTP_SCHEME;
    }
    // Guarantee there is at least a "/" path before rule processing.
    if (-1 == searchUrl.indexOf("/")) {
        searchUrl = scheme + searchUrl + "/";
    } else {
        searchUrl = scheme + searchUrl;
    }
    // Custom rules: each rule rewrites the URL only if its pattern matches.
    for (CanonicalizationRule rule : getProcessingRules()) {
        searchUrl = rule.processIfMatches(new CanonicalizationInput(searchUrl));
    }
    // Core rules
    // TODO: These next few lines look crazy -- need to be reworked.. This
    // was the only easy way I could find to get the correct unescaping
    // out of UsableURIs, possible a bug. Definitely needs some TLC in any case,
    // as building UsableURIs is *not* a cheap operation.
    // unescape anything that can be:
    UsableURI tmpURI = null;
    try {
        tmpURI = UsableURIFactory.getInstance(searchUrl);
    } catch (StringIndexOutOfBoundsException e) {
        // Known factory failure mode on pathological input: log and fall back
        // to the partially-canonicalized URL rather than failing the lookup.
        LOGGER.warning(e.getMessage() + ": " + searchUrl);
        return searchUrl;
        // } catch(URIException e) {
        // LOGGER.warning(e.getMessage() + ": " + searchUrl);
        // return searchUrl;
    }
    // Re-setting the path forces UsableURI to re-run its escaping logic.
    tmpURI.setPath(tmpURI.getPath());
    // convert to UsableURI to perform required URI fixup:
    UsableURI searchURI = UsableURIFactory.getInstance(tmpURI.getURI());
    // replace ' ' with '+' (this is only to match Alexa's canonicalization)
    String newPath = searchURI.getEscapedPath().replace("%20", "+");
    // replace multiple consecutive '/'s in the path (loop handles runs of
    // three or more, which a single replace() pass would not fully collapse).
    while (newPath.contains("//")) {
        newPath = newPath.replace("//", "/");
    }
    // this would remove trailing a '/' character, unless the path is empty
    // but we're not going to do this just yet..
    // if((newPath.length() > 1) && newPath.endsWith("/")) {
    // newPath = newPath.substring(0,newPath.length()-1);
    // }
    // Reassemble: host[:port]path[?query]; scheme is deliberately omitted
    // from the key.
    StringBuilder sb = new StringBuilder(searchUrl.length());
    sb.append(searchURI.getHostBasename());
    // omit port if scheme default:
    int defaultSchemePort = UrlOperations.schemeToDefaultPort(scheme);
    if (searchURI.getPort() != defaultSchemePort && searchURI.getPort() != -1) {
        sb.append(":").append(searchURI.getPort());
    }
    sb.append(newPath);
    if (searchURI.getEscapedQuery() != null) {
        sb.append("?").append(searchURI.getEscapedQuery());
    }
    return sb.toString();
}
@Override public void map( Text key, WritableArchiveRecord value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException { ArchiveRecord record = value.getRecord(); ArchiveRecordHeader header = record.getHeader(); // Logging for debug info: log.debug( "Processing @" + header.getOffset() + "+" + record.available() + "," + header.getLength() + ": " + header.getUrl()); for (String h : header.getHeaderFields().keySet()) { log.debug("ArchiveHeader: " + h + " -> " + header.getHeaderValue(h)); } try { MDX mdx = new MDX(); Date crawl_date = ArchiveUtils.parse14DigitISODate(header.getDate(), null); if (crawl_date != null) { mdx.setTs(ArchiveUtils.get14DigitDate(crawl_date)); } else { mdx.setTs(header.getDate()); } mdx.setUrl(header.getUrl()); mdx.setHash(header.getDigest()); // Data from WARC record: mdx.put("source-file", key.toString()); mdx.put("content-type", header.getMimetype()); mdx.put("content-length", "" + header.getContentLength()); mdx.put("length", "" + header.getLength()); mdx.put("source-offset", "" + header.getOffset()); mdx.put("record-identifier", header.getRecordIdentifier()); for (String k : header.getHeaderFieldKeys()) { mdx.put("HEADER-" + k, "" + header.getHeaderValue(k)); } // check record type and look for HTTP data: Header[] httpHeaders = null; if (record instanceof WARCRecord) { mdx.setRecordType("warc." + header.getHeaderValue(HEADER_KEY_TYPE)); mdx.setHash("" + header.getHeaderValue(WARCConstants.HEADER_KEY_PAYLOAD_DIGEST)); // There are not always headers! The code should check first. 
String statusLine = HttpParser.readLine(record, "UTF-8"); if (statusLine != null && statusLine.startsWith("HTTP")) { String firstLine[] = statusLine.split(" "); if (firstLine.length > 1) { String statusCode = firstLine[1].trim(); mdx.put("status-code", statusCode); try { httpHeaders = HttpParser.parseHeaders(record, "UTF-8"); } catch (ProtocolException p) { log.error( "ProtocolException [" + statusCode + "]: " + header.getHeaderValue(WARCConstants.HEADER_KEY_FILENAME) + "@" + header.getHeaderValue(WARCConstants.ABSOLUTE_OFFSET_KEY), p); } } else { log.warn("Could not parse status line: " + statusLine); } } else { log.warn( "Invalid status line: " + header.getHeaderValue(WARCConstants.HEADER_KEY_FILENAME) + "@" + header.getHeaderValue(WARCConstants.ABSOLUTE_OFFSET_KEY)); } } else if (record instanceof ARCRecord) { mdx.setRecordType("arc"); ARCRecord arcr = (ARCRecord) record; mdx.put("status-code", "" + arcr.getStatusCode()); httpHeaders = arcr.getHttpHeaders(); } else { mdx.setRecordType("unknown"); } // Add in http headers if (httpHeaders != null) { for (Header h : httpHeaders) { mdx.put("HTTP-" + h.getName(), h.getValue()); } } // URL: String uri = header.getUrl(); if (uri != null) { UsableURI uuri = UsableURIFactory.getInstance(uri); // Hosts: if ("https".contains(uuri.getScheme())) { mdx.put("host", uuri.getAuthority()); } } else { mdx.put("errors", "malformed-url"); } // Year String date = header.getDate(); if (date != null && date.length() > 4) { mdx.put("year", date.substring(0, 4)); } else { mdx.put("errors", "malformed-date"); } // And collect: String outKey = mdx.getHash(); if (outKey == null || outKey == "" || "null".equals(outKey)) { outKey = mdx.getRecordType() + ":" + header.getMimetype(); } else { outKey = mdx.getRecordType() + ":" + outKey; } output.collect(new Text(outKey), new Text(mdx.toString())); } catch (JSONException e) { // TODO Auto-generated catch block e.printStackTrace(); } }