/**
 * Flush hook for this operation.
 *
 * <p>Logs that the flush is happening, calls {@code terminate()}, and then hands
 * control to the superclass implementation.
 *
 * @param flowProcess the flow process passed in by the framework
 * @param operationCall the operation call passed in by the framework
 */
@Override
public void flush(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
    final String message = "Flushing FilterAndScoreByUrlAndRobots";
    LOGGER.info(message);

    // Terminate first, then let the parent class do its own flush work.
    terminate();
    super.flush(flowProcess, operationCall);
}
/**
 * Flush hook for this operation.
 *
 * <p>Emits an info-level log line, invokes {@code terminate()}, and finally
 * delegates to the superclass flush.
 *
 * @param process the flow process passed in by the framework
 * @param operationCall the operation call passed in by the framework
 */
@Override
public void flush(FlowProcess process, OperationCall<NullContext> operationCall) {
    final String message = "Flushing FetchBuffer";
    LOGGER.info(message);

    // Terminate before the superclass gets a chance to flush.
    terminate();
    super.flush(process, operationCall);
}
/**
 * Prepare hook: runs the superclass preparation and then sets up the XML reader
 * and the reusable input datum.
 *
 * <p>The reader is a {@code SAXReader} built on a new {@code Parser}, with a
 * {@code DowngradeXmlFilter} (configured from {@code _removeNamespaces}) installed
 * as its XML filter and its encoding set to UTF-8.
 *
 * @param process the flow process passed in by the framework
 * @param opCall the operation call passed in by the framework
 */
@Override
public void prepare(FlowProcess process, OperationCall<NullContext> opCall) {
    super.prepare(process, opCall);

    // Build the reader on top of a fresh parser instance.
    Parser underlyingParser = new Parser();
    _reader = new SAXReader(underlyingParser);

    // Configure the reader: filter first, then encoding.
    DowngradeXmlFilter downgradeFilter = new DowngradeXmlFilter(_removeNamespaces);
    _reader.setXMLFilter(downgradeFilter);
    _reader.setEncoding("UTF-8");

    _input = new ParsedDatum();
}
/**
 * Cleanup hook for this operation.
 *
 * <p>Logs the cleanup, calls {@code terminate()}, dumps the counters of
 * {@code _flowProcess}, and then delegates to the superclass cleanup.
 *
 * @param process the flow process passed in by the framework
 * @param operationCall the operation call passed in by the framework
 */
@Override
public void cleanup(FlowProcess process, OperationCall<NullContext> operationCall) {
    final String message = "Cleaning up FetchBuffer";
    LOGGER.info(message);

    // Terminate before reporting so the counters are dumped afterwards.
    terminate();
    _flowProcess.dumpCounters();

    super.cleanup(process, operationCall);
}
/**
 * Cleanup hook for this operation.
 *
 * <p>Writes an info-level log line, calls {@code terminate()}, dumps the counters
 * of {@code _flowProcess}, and finally runs the superclass cleanup.
 *
 * @param flowProcess the flow process passed in by the framework
 * @param operationCall the operation call passed in by the framework
 */
@Override
public void cleanup(
        FlowProcess flowProcess,
        cascading.operation.OperationCall<NullContext> operationCall) {
    final String message = "Cleaning up FilterAndScoreByUrlAndRobots";
    LOGGER.info(message);

    // Terminate before reporting so the counters are dumped afterwards.
    terminate();
    _flowProcess.dumpCounters();

    super.cleanup(flowProcess, operationCall);
}
/**
 * Prepare hook: runs the superclass preparation and then initialises the
 * per-run state of this operation.
 *
 * <p>State created here:
 * <ul>
 *   <li>{@code _flowProcess} - a {@code LoggingFlowProcess} wrapping the supplied
 *       flow process, with a {@code LoggingFlowReporter} added;</li>
 *   <li>{@code _executor} - a {@code ThreadedExecutor} constructed from the
 *       fetcher's max thread count and its policy's request timeout;</li>
 *   <li>{@code _refLock}, {@code _pendingRefs}, {@code _activeRefs} - a new lock
 *       object and two empty concurrent maps;</li>
 *   <li>{@code _keepCollecting} - an {@code AtomicBoolean} initialised to true.</li>
 * </ul>
 *
 * @param flowProcess the flow process passed in by the framework
 * @param operationCall the operation call passed in by the framework
 */
@SuppressWarnings({"unchecked"})
@Override
public void prepare(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
    super.prepare(flowProcess, operationCall);

    // Logging wrapper around the framework-supplied process.
    _flowProcess = new LoggingFlowProcess(flowProcess);
    _flowProcess.addReporter(new LoggingFlowReporter());

    // Executor configured from the fetcher and its policy.
    _executor =
            new ThreadedExecutor(
                    _fetcher.getMaxThreads(),
                    _fetcher.getFetcherPolicy().getRequestTimeout());

    // Fresh ref-tracking state.
    _refLock = new Object();
    _pendingRefs = new ConcurrentHashMap<String, Long>();
    _activeRefs = new ConcurrentHashMap<String, Long>();

    _keepCollecting = new AtomicBoolean(true);
}
/**
 * Cleanup hook: prints a one-line summary built from {@code _numSkipped} and
 * {@code _numEmails} to standard output, then delegates to the superclass
 * cleanup.
 *
 * @param flowProcess the flow process passed in by the framework
 * @param operationCall the operation call passed in by the framework
 */
@Override
public void cleanup(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
    String summary = String.format("Skipped %d emails out of %d", _numSkipped, _numEmails);
    System.out.println(summary);

    super.cleanup(flowProcess, operationCall);
}
@Override public void prepare(FlowProcess flowProcess, OperationCall<NullContext> operationCall) { super.prepare(flowProcess, operationCall); _numEmails = 0; _emailChars = 0; _numSkipped = 0; _parser = new RFC822Parser(); _content = new StringBuffer(); _handler = new DefaultHandler() { private boolean inParagraph = false; private boolean inQuotes = false; @Override public void startDocument() throws SAXException { super.startDocument(); inParagraph = false; inQuotes = false; _content.setLength(0); } @Override public void startElement( String uri, String localName, String qName, Attributes attributes) throws SAXException { if (localName.equalsIgnoreCase("p")) { inParagraph = true; } else if (localName.equalsIgnoreCase("q")) { // FUTURE the RFC822 parser from Tika isn't adding quote elements to text, so // currently this does nothing. inQuotes = true; } else if (localName.equalsIgnoreCase("br")) { _content.append('\n'); } else if (localName.equalsIgnoreCase("meta")) { // do nothing } } @Override public void endElement(String uri, String localName, String name) throws SAXException { if (localName.equalsIgnoreCase("p")) { inParagraph = false; _content.append('\n'); } else if (localName.equalsIgnoreCase("q")) { inQuotes = false; } } @Override public void characters(char[] ch, int start, int length) throws SAXException { if (inParagraph && !inQuotes) { // We have text we want to process. _content.append(ch, start, length); // HACK - parser isn't putting spaces or breaks between lines. _content.append(' '); } } }; }
/**
 * Prepare hook: runs the superclass preparation and then installs a new
 * {@link Random}, seeded from the {@code seed} field, as the context of the
 * operation call.
 *
 * @param flowProcess the flow process passed in by the framework
 * @param operationCall the operation call that receives the {@code Random} context
 */
@Override
public void prepare(FlowProcess flowProcess, OperationCall<Random> operationCall) {
    super.prepare(flowProcess, operationCall);

    Random seededGenerator = new Random(seed);
    operationCall.setContext(seededGenerator);
}