protected void writeCdxResponse( CDXWriter responseWriter, CloseableIterator<String> cdx, int readLimit, CDXQuery query, AuthToken authToken, CDXAccessFilter accessChecker) { BaseProcessor outputProcessor = responseWriter; if (query.limit < 0) { query.limit = Math.min(-query.limit, readLimit); outputProcessor = new LastNLineProcessor(outputProcessor, query.limit); } else if (query.limit == 0) { query.limit = readLimit; } else { query.limit = Math.min(query.limit, readLimit); } if (!query.closest.isEmpty() && query.isSortClosest()) { outputProcessor = new ClosestTimestampSorted(outputProcessor, query.closest, query.limit); } // Experimental if (query.resolveRevisits) { if (query.isReverse()) { outputProcessor = new ReverseRevisitResolver(outputProcessor, query.showDupeCount); } else { outputProcessor = new ForwardRevisitResolver(outputProcessor, query.showDupeCount); } } else if (query.showDupeCount) { outputProcessor = new DupeCountProcessor(outputProcessor, true); } if (query.showGroupCount || query.showUniqCount) { outputProcessor = new GroupCountProcessor(outputProcessor, query.lastSkipTimestamp, query.showUniqCount); } if (query.collapseTime > 0) { if (collapseToLast) { outputProcessor = new DupeTimestampLastBestStatusFilter( outputProcessor, query.collapseTime, noCollapsePrefix); } else { outputProcessor = new DupeTimestampBestStatusFilter( outputProcessor, query.collapseTime, noCollapsePrefix); } } FieldSplitFormat parseFormat = outputProcessor.modifyOutputFormat(cdxLineFactory.getParseFormat()); FieldRegexFilter filterMatcher = null; if (query.filter != null && (query.filter.length > 0)) { filterMatcher = new FieldRegexFilter(query.filter, parseFormat); } CollapseFieldFilter collapser = null; if (query.collapse != null && (query.collapse.length > 0)) { collapser = new CollapseFieldFilter(query.collapse, parseFormat); } // CDXLine prev = null; CDXLine line = null; // boolean prevUrlAllowed = true; FieldSplitFormat outputFields = null; if (!authChecker.isAllCdxFieldAccessAllowed(authToken)) { outputFields = this.authChecker.getPublicCdxFormat(); } if (!query.fl.isEmpty()) { if (outputFields == null) { outputFields = parseFormat; } try { outputFields = outputFields.createSubset(URLDecoder.decode(query.fl, "UTF-8")); } catch (UnsupportedEncodingException e) { } } else if (outputFields != null) { outputFields = parseFormat.createSubset(outputFields); } outputProcessor.begin(); int writeCount = 0; long allCount = 0; int writeLimit = query.limit; while (cdx.hasNext() && ((writeLimit == 0) || (writeCount < writeLimit)) && (allCount < readLimit) && !responseWriter.isAborted()) { String rawLine = cdx.next(); allCount++; if (query.offset > 0) { --query.offset; continue; } // prev = line; // line = new CDXLine(rawLine, parseFormat); line = this.cdxLineFactory.createStandardCDXLine(rawLine, parseFormat); // TODO: better way to handle this special case? if (line.getMimeType().equals("alexa/dat")) { continue; } // Additional access check, per capture if (accessChecker != null) { if (!accessChecker.includeCapture(line)) { continue; } } // if (!authChecker.isAllUrlAccessAllowed(authToken)) { // if ((query.matchType != MatchType.exact) && ((prev == null) || // !line.getUrlKey().equals(prev.getUrlKey()))) { // prevUrlAllowed = authChecker.isUrlAllowed(line.getOriginalUrl(), authToken); // } // // if (!prevUrlAllowed) { // continue; // } // } // // if (!authChecker.isCaptureAllowed(line, authToken)) { // continue; // } // outputProcessor.trackLine(line); // Timestamp Range Filtering String timestamp = line.getTimestamp(); if (!query.from.isEmpty() && (timestamp.compareTo(query.from) < 0)) { continue; } if (!query.to.isEmpty() && (timestamp.compareTo(query.to) > 0) && !timestamp.startsWith(query.to)) { if (query.matchType == MatchType.exact) { break; } else { continue; } } // Check regex matcher if it exists if ((filterMatcher != null) && !filterMatcher.include(line)) { continue; } // Check collapser if ((collapser != null) && !collapser.include(line)) { continue; } // Filter to only include output fields if (outputFields != null) { line = new CDXLine(line, outputFields); } writeCount += outputProcessor.writeLine(line); if (Thread.interrupted()) { break; } } if (query.showResumeKey && (line != null) && (writeLimit > 0) && (writeCount >= writeLimit)) { StringBuilder sb = new StringBuilder(); sb.append(line.getUrlKey()); sb.append(' '); sb.append(UrlSurtRangeComputer.incLastChar(line.getTimestamp())); String resumeKey; try { resumeKey = URLEncoder.encode(sb.toString(), "UTF-8"); outputProcessor.writeResumeKey(resumeKey); } catch (UnsupportedEncodingException e) { } } outputProcessor.end(); }
public void getCdx(CDXQuery query, AuthToken authToken, CDXWriter responseWriter) throws IOException { CloseableIterator<String> iter = null; try { // Check for wildcards as shortcuts for matchType if (query.matchType == null) { if (query.url.startsWith("*.")) { query.matchType = MatchType.domain; query.url = query.url.substring(2); } else if (query.url.endsWith("*")) { query.matchType = MatchType.prefix; query.url = query.url.substring(0, query.url.length() - 1); } else { query.matchType = MatchType.exact; } } CDXAccessFilter accessChecker = null; if (!authChecker.isAllUrlAccessAllowed(authToken)) { accessChecker = authChecker.createAccessFilter(authToken); } // // For now, don't support domain or host output w/o key as access check is too slow // if (query.matchType == MatchType.domain || query.matchType == MatchType.host) { // if (!authChecker.isAllUrlAccessAllowed(authToken)) { // return; // } // } String startEndUrl[] = urlSurtRangeComputer.determineRange(query.url, query.matchType, "", ""); if (startEndUrl == null) { responseWriter.printError( "Sorry, matchType=" + query.matchType.name() + " is not supported by this server"); return; } if ((accessChecker != null) && !accessChecker.includeUrl(startEndUrl[0], query.url)) { if (query.showNumPages) { // Default to 1 page even if no results responseWriter.printNumPages(1, false); } return; } if (query.last || query.limit == -1) { query.limit = 1; query.setSort(SortType.reverse); } int maxLimit; if (query.fastLatest == null) { // Optimize: default fastLatest to true for last line or closest // sorted results if ((query.limit == -1) || (!query.closest.isEmpty() && (query.limit > 0))) { query.fastLatest = true; } else { query.fastLatest = false; } } // Paged query if (query.page >= 0 || query.showNumPages) { iter = createPagedCdxIterator(startEndUrl, query, authToken, responseWriter); if (iter == null) { return; } // Page size determines the max limit here maxLimit = Integer.MAX_VALUE; } else { // Non-Paged Merged query iter = createBoundedCdxIterator(startEndUrl, query, null, null); // TODO: apply collection-view filtering here. It should happen separately // from exclusion check. We'd need to parse CDX lines into CDXLine object // before passing it to writeCdxResponse(). Pass CDXFilter to getCdx()? // Pass CDX source object that escapsulates collection-view filtering? maxLimit = this.queryMaxLimit; } writeCdxResponse(responseWriter, iter, maxLimit, query, authToken, accessChecker); } catch (URIException e) { responseWriter.printError(e.toString()); } catch (URISyntaxException e) { responseWriter.printError(e.toString()); } finally { if (iter != null) { iter.close(); } } }