Пример #1
0
  /**
   * Streams raw CDX lines from {@code cdx} through the processor and filter chain selected by
   * {@code query}, writing the surviving lines to {@code responseWriter}.
   *
   * <p>The processor chain is built inside-out: {@code responseWriter} is the innermost sink and
   * each enabled query option wraps it in another {@code BaseProcessor}. Every parsed line is
   * then access-checked, tracked, range-filtered, regex-filtered, collapsed, optionally projected
   * onto the permitted/requested output fields, and finally written.
   *
   * <p>Side effects on {@code query}: {@code query.limit} is normalised (made non-negative and
   * capped at {@code readLimit}) and {@code query.offset} is decremented once per skipped line.
   *
   * @param responseWriter innermost sink of the processor chain; also polled for
   *     {@code isAborted()} to stop early
   * @param cdx iterator over raw CDX lines; this method does not close it
   * @param readLimit maximum number of raw lines to read, and the cap applied to
   *     {@code query.limit}
   * @param query query options controlling limits, sorting, filtering and output fields
   * @param authToken token used to decide whether all CDX fields may be returned
   * @param accessChecker optional per-capture access filter; may be {@code null}
   */
  protected void writeCdxResponse(
      CDXWriter responseWriter,
      CloseableIterator<String> cdx,
      int readLimit,
      CDXQuery query,
      AuthToken authToken,
      CDXAccessFilter accessChecker) {

    BaseProcessor outputProcessor = responseWriter;

    // Normalise the limit: negative selects the LastNLineProcessor with |limit|, zero means
    // "use readLimit", and every value is capped at readLimit.
    if (query.limit < 0) {
      // -Integer.MIN_VALUE overflows back to Integer.MIN_VALUE, so treat it as "as many as allowed".
      int lastN = (query.limit == Integer.MIN_VALUE) ? Integer.MAX_VALUE : -query.limit;
      query.limit = Math.min(lastN, readLimit);
      outputProcessor = new LastNLineProcessor(outputProcessor, query.limit);
    } else if (query.limit == 0) {
      query.limit = readLimit;
    } else {
      query.limit = Math.min(query.limit, readLimit);
    }

    if (!query.closest.isEmpty() && query.isSortClosest()) {
      outputProcessor = new ClosestTimestampSorted(outputProcessor, query.closest, query.limit);
    }

    // Experimental
    if (query.resolveRevisits) {
      if (query.isReverse()) {
        outputProcessor = new ReverseRevisitResolver(outputProcessor, query.showDupeCount);
      } else {
        outputProcessor = new ForwardRevisitResolver(outputProcessor, query.showDupeCount);
      }
    } else if (query.showDupeCount) {
      outputProcessor = new DupeCountProcessor(outputProcessor, true);
    }

    if (query.showGroupCount || query.showUniqCount) {
      outputProcessor =
          new GroupCountProcessor(outputProcessor, query.lastSkipTimestamp, query.showUniqCount);
    }

    if (query.collapseTime > 0) {
      if (collapseToLast) {
        outputProcessor =
            new DupeTimestampLastBestStatusFilter(
                outputProcessor, query.collapseTime, noCollapsePrefix);
      } else {
        outputProcessor =
            new DupeTimestampBestStatusFilter(
                outputProcessor, query.collapseTime, noCollapsePrefix);
      }
    }

    // The processors may extend the parse format; filters below are built against the
    // resulting format.
    FieldSplitFormat parseFormat =
        outputProcessor.modifyOutputFormat(cdxLineFactory.getParseFormat());

    FieldRegexFilter filterMatcher = null;

    if (query.filter != null && (query.filter.length > 0)) {
      filterMatcher = new FieldRegexFilter(query.filter, parseFormat);
    }

    CollapseFieldFilter collapser = null;

    if (query.collapse != null && (query.collapse.length > 0)) {
      collapser = new CollapseFieldFilter(query.collapse, parseFormat);
    }

    // Last line parsed from the input, always in the full parse format. It is kept separate
    // from the projected line that gets written so the resume key can read urlkey/timestamp
    // even when those fields are not part of the output.
    CDXLine line = null;

    // Output projection: null means "write every parsed field".
    FieldSplitFormat outputFields = null;

    if (!authChecker.isAllCdxFieldAccessAllowed(authToken)) {
      outputFields = authChecker.getPublicCdxFormat();
    }

    if (!query.fl.isEmpty()) {
      if (outputFields == null) {
        outputFields = parseFormat;
      }
      try {
        outputFields = outputFields.createSubset(URLDecoder.decode(query.fl, "UTF-8"));
      } catch (UnsupportedEncodingException ignored) {
        // UTF-8 is a charset every Java platform must support, so this is not expected to
        // happen; if it ever does, the un-narrowed outputFields is used as before.
      }
    } else if (outputFields != null) {
      outputFields = parseFormat.createSubset(outputFields);
    }

    outputProcessor.begin();

    int writeCount = 0;
    long allCount = 0;

    int writeLimit = query.limit;

    while (cdx.hasNext()
        && ((writeLimit == 0) || (writeCount < writeLimit))
        && (allCount < readLimit)
        && !responseWriter.isAborted()) {

      String rawLine = cdx.next();
      allCount++;

      // Skipped lines still count against readLimit (allCount was already incremented).
      if (query.offset > 0) {
        --query.offset;
        continue;
      }

      line = this.cdxLineFactory.createStandardCDXLine(rawLine, parseFormat);

      // TODO: better way to handle this special case?
      // Constant-first comparison so a line without a MIME type is kept instead of
      // raising a NullPointerException.
      if ("alexa/dat".equals(line.getMimeType())) {
        continue;
      }

      // Additional access check, per capture
      if (accessChecker != null && !accessChecker.includeCapture(line)) {
        continue;
      }

      // NOTE: per-URL / per-capture checks that called authChecker directly used to sit here
      // as commented-out code; the active per-capture check is the accessChecker call above.

      // Tracking happens before the range/regex/collapse filters, so processors see lines
      // that are subsequently filtered out as well.
      outputProcessor.trackLine(line);

      // Timestamp Range Filtering
      String timestamp = line.getTimestamp();

      if (!query.from.isEmpty() && (timestamp.compareTo(query.from) < 0)) {
        continue;
      }

      // A timestamp that merely extends query.to (same prefix) is still inside the range.
      if (!query.to.isEmpty()
          && (timestamp.compareTo(query.to) > 0)
          && !timestamp.startsWith(query.to)) {
        if (query.matchType == MatchType.exact) {
          // For exact matches reading stops at the first line past the upper bound
          // (presumably because input is timestamp-ordered for a single url key).
          break;
        } else {
          continue;
        }
      }

      // Check regex matcher if it exists
      if ((filterMatcher != null) && !filterMatcher.include(line)) {
        continue;
      }

      // Check collapser
      if ((collapser != null) && !collapser.include(line)) {
        continue;
      }

      // Filter to only include output fields; `line` itself stays un-projected.
      CDXLine outputLine = (outputFields != null) ? new CDXLine(line, outputFields) : line;

      writeCount += outputProcessor.writeLine(outputLine);

      // Thread.interrupted() also clears the interrupt flag; original behaviour is preserved.
      if (Thread.interrupted()) {
        break;
      }
    }

    // Emit a resume key only when the output was cut short by the write limit.
    if (query.showResumeKey && (line != null) && (writeLimit > 0) && (writeCount >= writeLimit)) {
      StringBuilder sb = new StringBuilder();
      sb.append(line.getUrlKey());
      sb.append(' ');
      sb.append(UrlSurtRangeComputer.incLastChar(line.getTimestamp()));
      try {
        String resumeKey = URLEncoder.encode(sb.toString(), "UTF-8");
        outputProcessor.writeResumeKey(resumeKey);
      } catch (UnsupportedEncodingException ignored) {
        // UTF-8 is always available; if encoding were to fail, no resume key is written.
      }
    }

    outputProcessor.end();
  }
Пример #2
0
  /**
   * Executes a CDX query and writes the result to {@code responseWriter}.
   *
   * <p>Steps, in order: infer a missing {@code matchType} from wildcard shorthand in the URL,
   * obtain an access filter when the token does not grant access to all URLs, compute the
   * start/end range for the URL, apply the "last line" and {@code fastLatest} defaults, create
   * either a paged or a bounded iterator, and hand it to {@code writeCdxResponse}. The iterator,
   * if one was created, is always closed before returning.
   *
   * <p>{@code query} is modified in place ({@code matchType}, {@code url}, {@code limit}, sort
   * and {@code fastLatest} may all be rewritten).
   *
   * @param query the query to run; mutated as described above
   * @param authToken token consulted for URL-level access
   * @param responseWriter receives results, page counts and error messages
   * @throws IOException declared by this method; the visible code itself only reports
   *     {@code URIException} / {@code URISyntaxException} through {@code printError}
   */
  public void getCdx(CDXQuery query, AuthToken authToken, CDXWriter responseWriter)
      throws IOException {
    CloseableIterator<String> cdxIterator = null;

    try {
      // Wildcards in the URL act as shorthand when no matchType was given:
      // "*.host" -> domain match, "prefix*" -> prefix match, anything else -> exact.
      if (query.matchType == null) {
        final String requestedUrl = query.url;
        if (requestedUrl.startsWith("*.")) {
          query.matchType = MatchType.domain;
          query.url = requestedUrl.substring(2);
        } else if (requestedUrl.endsWith("*")) {
          query.matchType = MatchType.prefix;
          query.url = requestedUrl.substring(0, requestedUrl.length() - 1);
        } else {
          query.matchType = MatchType.exact;
        }
      }

      // An access filter is only needed when the token is not allowed to see every URL.
      final CDXAccessFilter accessChecker =
          authChecker.isAllUrlAccessAllowed(authToken)
              ? null
              : authChecker.createAccessFilter(authToken);

      final String[] surtRange =
          urlSurtRangeComputer.determineRange(query.url, query.matchType, "", "");

      // A null range means this server cannot serve the requested matchType.
      if (surtRange == null) {
        responseWriter.printError(
            "Sorry, matchType=" + query.matchType.name() + " is not supported by this server");
        return;
      }

      final boolean urlDenied =
          (accessChecker != null) && !accessChecker.includeUrl(surtRange[0], query.url);
      if (urlDenied) {
        if (query.showNumPages) {
          // Default to 1 page even if no results
          responseWriter.printNumPages(1, false);
        }
        return;
      }

      // "last" and limit == -1 both mean: one result, newest first.
      if (query.last || query.limit == -1) {
        query.limit = 1;
        query.setSort(SortType.reverse);
      }

      if (query.fastLatest == null) {
        // Optimize: default fastLatest to true for last line or closest sorted results.
        // NOTE(review): limit == -1 was rewritten to 1 just above, so the first operand
        // cannot be true at this point; kept unchanged to preserve behaviour.
        query.fastLatest =
            (query.limit == -1) || (!query.closest.isEmpty() && (query.limit > 0));
      }

      final int readLimit;

      if (query.page >= 0 || query.showNumPages) {
        // Paged query
        cdxIterator = createPagedCdxIterator(surtRange, query, authToken, responseWriter);

        if (cdxIterator == null) {
          return;
        }

        // Page size determines the max limit here
        readLimit = Integer.MAX_VALUE;
      } else {
        // Non-paged merged query
        cdxIterator = createBoundedCdxIterator(surtRange, query, null, null);

        // TODO: apply collection-view filtering here. It should happen separately
        // from exclusion check. We'd need to parse CDX lines into CDXLine object
        // before passing it to writeCdxResponse(). Pass CDXFilter to getCdx()?
        // Pass CDX source object that encapsulates collection-view filtering?

        readLimit = this.queryMaxLimit;
      }

      writeCdxResponse(responseWriter, cdxIterator, readLimit, query, authToken, accessChecker);

    } catch (URIException e) {
      responseWriter.printError(e.toString());
    } catch (URISyntaxException e) {
      responseWriter.printError(e.toString());
    } finally {
      if (cdxIterator != null) {
        cdxIterator.close();
      }
    }
  }