protected CloseableIterator<String> createBoundedCdxIterator( String[] startEndUrl, CDXQuery query, PageResult pageResult, CloseableIterator<String> idx) throws IOException { String searchKey = null; ZipNumParams params = new ZipNumParams(defaultParams); // Opt: testing out sequential load! if (Math.abs(query.limit) == 1) { params.setSequential(true); } params.setReverse(query.isReverse()); if (!query.resumeKey.isEmpty()) { searchKey = URLDecoder.decode(query.resumeKey, "UTF-8"); startEndUrl[0] = searchKey; // int lastSpace = startEndUrl[0].lastIndexOf(' '); // if (lastSpace > 0) { // startEndUrl[0] = searchKey.substring(0, lastSpace); // } } else if (!query.from.isEmpty()) { searchKey = startEndUrl[0] + " " + query.from; } else if (query.isReverse() && !query.closest.isEmpty()) { searchKey = startEndUrl[0]; startEndUrl[1] = startEndUrl[0] + " " + query.closest; } else if (query.fastLatest) { String endkey = (query.closest.isEmpty() ? "!" : " " + query.closest); params.setMaxAggregateBlocks(1); searchKey = startEndUrl[0] + endkey; } else { searchKey = startEndUrl[0]; } if (pageResult != null) { params.setTimestampDedupLength(0); return zipnumSource.getCDXIterator( idx, searchKey, startEndUrl[1], query.page, pageResult.numPages, params); } else { return cdxSource.getCDXIterator(searchKey, startEndUrl[0], startEndUrl[1], params); } }
/**
 * Handles a paged ({@code page=}) query and returns the CDX iterator for the requested page.
 *
 * <p>Returns {@code null} whenever the response has already been fully written or there is
 * nothing to iterate:
 * <ul>
 *   <li>no {@code zipnumSource} is configured (an error message is printed);</li>
 *   <li>{@code query.showNumPages} is set (only the page count is printed);</li>
 *   <li>the page result carries no iterator;</li>
 *   <li>{@code query.showPagedIndex} is set and the token has all-URL access (the index
 *       lines are written via {@code writeIdxResponse}).</li>
 * </ul>
 *
 * <p>Side effect: {@code query.pageSize} is reset to {@code maxPageSize} when it is not
 * positive, or when it exceeds {@code maxPageSize} and the token lacks all-URL access.
 *
 * @param startEndUrl    start/end key pair for the query
 * @param query          the parsed CDX query; {@code pageSize} may be modified
 * @param authToken      caller's auth token, checked through {@code authChecker}
 * @param responseWriter writer that receives errors, page counts and line limits
 * @return iterator over the page's CDX lines, or {@code null} as described above
 * @throws IOException propagated from the zipnum source or the bounded iterator
 */
protected CloseableIterator<String> createPagedCdxIterator(
        String[] startEndUrl,
        CDXQuery query,
        AuthToken authToken,
        CDXWriter responseWriter) throws IOException {

    if (zipnumSource == null) {
        responseWriter.printError(
                "Sorry, this server is not configured to support paged query. Remove page= param and try again.");
        return null;
    }

    final boolean allAccess = authChecker.isAllUrlAccessAllowed(authToken);

    // Clamp the page size: non-positive always, over-the-cap only for restricted callers.
    final boolean unsetSize = query.pageSize <= 0;
    final boolean overCap = !allAccess && (query.pageSize > maxPageSize);
    if (unsetSize || overCap) {
        query.pageSize = maxPageSize;
    }

    final PageResult pageResult = zipnumSource.getNthPage(
            startEndUrl, query.page, query.pageSize, query.showNumPages);

    // The page count is always reported; the flag tells the writer whether that is all.
    final boolean numPagesOnly = query.showNumPages;
    responseWriter.printNumPages(pageResult.numPages, numPagesOnly);
    if (numPagesOnly) {
        return null;
    }

    if (pageResult.iter == null) {
        return null;
    }

    CloseableIterator<String> pageIter = pageResult.iter;
    if (query.isReverse()) {
        pageIter = new LineBufferingIterator(pageIter, query.pageSize, true);
    }

    final String zipnumClusterUri = zipnumSource.getLocRoot();

    if (query.showPagedIndex && allAccess) {
        // Index listing: one output line per index entry, so the cap is the page size itself.
        responseWriter.setMaxLines(query.pageSize, zipnumClusterUri);
        writeIdxResponse(responseWriter, pageIter);
        return null;
    }

    // CDX listing: the cap is page size times the number of CDX lines per block.
    responseWriter.setMaxLines(
            query.pageSize * zipnumSource.getCdxLinesPerBlock(), zipnumClusterUri);

    return createBoundedCdxIterator(startEndUrl, query, pageResult, pageIter);
}
protected void writeCdxResponse( CDXWriter responseWriter, CloseableIterator<String> cdx, int readLimit, CDXQuery query, AuthToken authToken, CDXAccessFilter accessChecker) { BaseProcessor outputProcessor = responseWriter; if (query.limit < 0) { query.limit = Math.min(-query.limit, readLimit); outputProcessor = new LastNLineProcessor(outputProcessor, query.limit); } else if (query.limit == 0) { query.limit = readLimit; } else { query.limit = Math.min(query.limit, readLimit); } if (!query.closest.isEmpty() && query.isSortClosest()) { outputProcessor = new ClosestTimestampSorted(outputProcessor, query.closest, query.limit); } // Experimental if (query.resolveRevisits) { if (query.isReverse()) { outputProcessor = new ReverseRevisitResolver(outputProcessor, query.showDupeCount); } else { outputProcessor = new ForwardRevisitResolver(outputProcessor, query.showDupeCount); } } else if (query.showDupeCount) { outputProcessor = new DupeCountProcessor(outputProcessor, true); } if (query.showGroupCount || query.showUniqCount) { outputProcessor = new GroupCountProcessor(outputProcessor, query.lastSkipTimestamp, query.showUniqCount); } if (query.collapseTime > 0) { if (collapseToLast) { outputProcessor = new DupeTimestampLastBestStatusFilter( outputProcessor, query.collapseTime, noCollapsePrefix); } else { outputProcessor = new DupeTimestampBestStatusFilter( outputProcessor, query.collapseTime, noCollapsePrefix); } } FieldSplitFormat parseFormat = outputProcessor.modifyOutputFormat(cdxLineFactory.getParseFormat()); FieldRegexFilter filterMatcher = null; if (query.filter != null && (query.filter.length > 0)) { filterMatcher = new FieldRegexFilter(query.filter, parseFormat); } CollapseFieldFilter collapser = null; if (query.collapse != null && (query.collapse.length > 0)) { collapser = new CollapseFieldFilter(query.collapse, parseFormat); } // CDXLine prev = null; CDXLine line = null; // boolean prevUrlAllowed = true; FieldSplitFormat outputFields = null; if 
(!authChecker.isAllCdxFieldAccessAllowed(authToken)) { outputFields = this.authChecker.getPublicCdxFormat(); } if (!query.fl.isEmpty()) { if (outputFields == null) { outputFields = parseFormat; } try { outputFields = outputFields.createSubset(URLDecoder.decode(query.fl, "UTF-8")); } catch (UnsupportedEncodingException e) { } } else if (outputFields != null) { outputFields = parseFormat.createSubset(outputFields); } outputProcessor.begin(); int writeCount = 0; long allCount = 0; int writeLimit = query.limit; while (cdx.hasNext() && ((writeLimit == 0) || (writeCount < writeLimit)) && (allCount < readLimit) && !responseWriter.isAborted()) { String rawLine = cdx.next(); allCount++; if (query.offset > 0) { --query.offset; continue; } // prev = line; // line = new CDXLine(rawLine, parseFormat); line = this.cdxLineFactory.createStandardCDXLine(rawLine, parseFormat); // TODO: better way to handle this special case? if (line.getMimeType().equals("alexa/dat")) { continue; } // Additional access check, per capture if (accessChecker != null) { if (!accessChecker.includeCapture(line)) { continue; } } // if (!authChecker.isAllUrlAccessAllowed(authToken)) { // if ((query.matchType != MatchType.exact) && ((prev == null) || // !line.getUrlKey().equals(prev.getUrlKey()))) { // prevUrlAllowed = authChecker.isUrlAllowed(line.getOriginalUrl(), authToken); // } // // if (!prevUrlAllowed) { // continue; // } // } // // if (!authChecker.isCaptureAllowed(line, authToken)) { // continue; // } // outputProcessor.trackLine(line); // Timestamp Range Filtering String timestamp = line.getTimestamp(); if (!query.from.isEmpty() && (timestamp.compareTo(query.from) < 0)) { continue; } if (!query.to.isEmpty() && (timestamp.compareTo(query.to) > 0) && !timestamp.startsWith(query.to)) { if (query.matchType == MatchType.exact) { break; } else { continue; } } // Check regex matcher if it exists if ((filterMatcher != null) && !filterMatcher.include(line)) { continue; } // Check collapser if ((collapser != 
null) && !collapser.include(line)) { continue; } // Filter to only include output fields if (outputFields != null) { line = new CDXLine(line, outputFields); } writeCount += outputProcessor.writeLine(line); if (Thread.interrupted()) { break; } } if (query.showResumeKey && (line != null) && (writeLimit > 0) && (writeCount >= writeLimit)) { StringBuilder sb = new StringBuilder(); sb.append(line.getUrlKey()); sb.append(' '); sb.append(UrlSurtRangeComputer.incLastChar(line.getTimestamp())); String resumeKey; try { resumeKey = URLEncoder.encode(sb.toString(), "UTF-8"); outputProcessor.writeResumeKey(resumeKey); } catch (UnsupportedEncodingException e) { } } outputProcessor.end(); }