/**
   * Flushes this operation: logs the flush, invokes {@code terminate()}, and then delegates to the
   * superclass implementation.
   *
   * @param flowProcess the flow process passed through to the superclass
   * @param operationCall the operation call passed through to the superclass
   */
  @Override
  public void flush(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
    final String message = "Flushing FilterAndScoreByUrlAndRobots";
    LOGGER.info(message);

    // terminate() runs before the superclass gets to flush.
    terminate();
    super.flush(flowProcess, operationCall);
  }
Example #2
0
  /**
   * Flushes this operation: logs the flush, invokes {@code terminate()}, and then delegates to the
   * superclass implementation.
   *
   * @param process the flow process passed through to the superclass
   * @param operationCall the operation call passed through to the superclass
   */
  @Override
  public void flush(FlowProcess process, OperationCall<NullContext> operationCall) {
    final String message = "Flushing FetchBuffer";
    LOGGER.info(message);

    // terminate() runs before the superclass gets to flush.
    terminate();
    super.flush(process, operationCall);
  }
Example #3
0
  /**
   * Prepares this operation by creating the XML reader and the reusable input datum.
   *
   * <p>The reader wraps a new {@code Parser}, is given a {@code DowngradeXmlFilter} built from
   * {@code _removeNamespaces}, and has its encoding set to UTF-8.
   *
   * @param process the flow process passed through to the superclass
   * @param opCall the operation call passed through to the superclass
   */
  @Override
  public void prepare(FlowProcess process, OperationCall<NullContext> opCall) {
    super.prepare(process, opCall);

    // Configure the reader completely, then store it in the field.
    final SAXReader configuredReader = new SAXReader(new Parser());
    configuredReader.setXMLFilter(new DowngradeXmlFilter(_removeNamespaces));
    configuredReader.setEncoding("UTF-8");
    _reader = configuredReader;

    _input = new ParsedDatum();
  }
Example #4
0
  /**
   * Cleans up this operation: logs the cleanup, invokes {@code terminate()}, dumps the counters
   * of {@code _flowProcess}, and finally delegates to the superclass implementation.
   *
   * @param process the flow process passed through to the superclass
   * @param operationCall the operation call passed through to the superclass
   */
  @Override
  public void cleanup(FlowProcess process, OperationCall<NullContext> operationCall) {
    final String message = "Cleaning up FetchBuffer";
    LOGGER.info(message);

    // terminate() runs first; the counters are dumped only after it returns.
    terminate();
    _flowProcess.dumpCounters();

    super.cleanup(process, operationCall);
  }
  /**
   * Cleans up this operation: logs the cleanup, invokes {@code terminate()}, dumps the counters
   * of {@code _flowProcess}, and finally delegates to the superclass implementation.
   *
   * @param flowProcess the flow process passed through to the superclass
   * @param operationCall the operation call passed through to the superclass
   */
  @Override
  public void cleanup(
      FlowProcess flowProcess, cascading.operation.OperationCall<NullContext> operationCall) {
    final String message = "Cleaning up FilterAndScoreByUrlAndRobots";
    LOGGER.info(message);

    // terminate() runs first; the counters are dumped only after it returns.
    terminate();
    _flowProcess.dumpCounters();

    super.cleanup(flowProcess, operationCall);
  }
Example #6
0
  /**
   * Prepares this operation by creating its per-run state.
   *
   * <p>Wraps the supplied flow process in a {@code LoggingFlowProcess} with a {@code
   * LoggingFlowReporter} attached, builds a {@code ThreadedExecutor} from the fetcher's maximum
   * thread count and its policy's request timeout, and creates fresh reference maps, the
   * reference lock object, and the keep-collecting flag (initially {@code true}).
   *
   * @param flowProcess the flow process to wrap; also passed through to the superclass
   * @param operationCall the operation call passed through to the superclass
   */
  @SuppressWarnings("unchecked")
  @Override
  public void prepare(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
    super.prepare(flowProcess, operationCall);

    // Logging wrapper around the incoming flow process.
    _flowProcess = new LoggingFlowProcess(flowProcess);
    _flowProcess.addReporter(new LoggingFlowReporter());

    // Executor sized and timed from the fetcher's own settings.
    _executor =
        new ThreadedExecutor(
            _fetcher.getMaxThreads(), _fetcher.getFetcherPolicy().getRequestTimeout());

    // Fresh bookkeeping state; these initialisations do not depend on one another.
    _keepCollecting = new AtomicBoolean(true);
    _activeRefs = new ConcurrentHashMap<String, Long>();
    _pendingRefs = new ConcurrentHashMap<String, Long>();
    _refLock = new Object();
  }
 /**
  * Prints how many emails were skipped out of the total to standard output, then delegates to the
  * superclass implementation.
  *
  * @param flowProcess the flow process passed through to the superclass
  * @param operationCall the operation call passed through to the superclass
  */
 @Override
 public void cleanup(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
   final String summary = String.format("Skipped %d emails out of %d", _numSkipped, _numEmails);
   System.out.println(summary);

   super.cleanup(flowProcess, operationCall);
 }
    /**
     * Prepares this operation: zeroes the email counters, creates the RFC822 parser and the shared
     * content buffer, and installs a SAX handler that collects paragraph text into that buffer.
     *
     * <p>The handler appends character data only while inside a {@code p} element and not inside a
     * {@code q} element. A newline is appended for each {@code br} start tag and each {@code p}
     * end tag. Element names are matched case-insensitively.
     *
     * @param flowProcess the flow process passed through to the superclass
     * @param operationCall the operation call passed through to the superclass
     */
    @Override
    public void prepare(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
      super.prepare(flowProcess, operationCall);

      // Reset the running statistics.
      _numEmails = 0;
      _emailChars = 0;
      _numSkipped = 0;

      _parser = new RFC822Parser();
      _content = new StringBuffer();

      _handler =
          new DefaultHandler() {
            // Position flags, updated as elements open and close.
            private boolean insideParagraph = false;
            private boolean insideQuote = false;

            /** Resets the position flags and empties the content buffer for a new document. */
            @Override
            public void startDocument() throws SAXException {
              super.startDocument();

              insideParagraph = false;
              insideQuote = false;
              _content.setLength(0);
            }

            /** Updates the position flags, or emits a newline for a line break element. */
            @Override
            public void startElement(
                String uri, String localName, String qName, Attributes attributes)
                throws SAXException {
              if (localName.equalsIgnoreCase("p")) {
                insideParagraph = true;
                return;
              }

              if (localName.equalsIgnoreCase("q")) {
                // FUTURE the RFC822 parser from Tika isn't adding quote elements to text, so
                // currently this does nothing.
                insideQuote = true;
                return;
              }

              if (localName.equalsIgnoreCase("br")) {
                _content.append('\n');
              }

              // Every other element (including "meta") is deliberately ignored.
            }

            /** Clears the matching position flag; a closing paragraph also emits a newline. */
            @Override
            public void endElement(String uri, String localName, String name) throws SAXException {
              if (localName.equalsIgnoreCase("p")) {
                insideParagraph = false;
                _content.append('\n');
                return;
              }

              if (localName.equalsIgnoreCase("q")) {
                insideQuote = false;
              }
            }

            /** Appends paragraph text, followed by a single space, to the content buffer. */
            @Override
            public void characters(char[] ch, int start, int length) throws SAXException {
              // Only unquoted paragraph text is collected.
              if (!insideParagraph || insideQuote) {
                return;
              }

              _content.append(ch, start, length);
              // HACK - parser isn't putting spaces or breaks between lines.
              _content.append(' ');
            }
          };
    }
Example #9
0
  /**
   * Prepares this operation by storing a new {@link Random}, constructed from {@code seed}, as
   * the context of the operation call.
   *
   * @param flowProcess the flow process passed through to the superclass
   * @param operationCall the operation call that receives the new {@code Random} as its context
   */
  @Override
  public void prepare(FlowProcess flowProcess, OperationCall<Random> operationCall) {
    super.prepare(flowProcess, operationCall);

    final Random seededRandom = new Random(seed);
    operationCall.setContext(seededRandom);
  }