public void operate(FlowProcess flowProcess, FunctionCall<NullContext> functionCall) { String filename = functionCall.getArguments().getString("line"); Tuple result = new Tuple(""); BufferedReader reader; try { InputStream stream = new FileInputStream(filename); reader = new BufferedReader(new InputStreamReader(stream, "us-ascii")); } catch (UnsupportedEncodingException e) { throw new RuntimeException("Impossible exception!", e); } catch (Exception e) { throw new RuntimeException(String.format("Exception splitting mbox file %s", filename), e); } StringBuilder email = new StringBuilder(); for (String curLine = safeReadLine(reader); curLine != null; curLine = safeReadLine(reader)) { if (curLine.startsWith(MBOX_RECORD_DIVIDER)) { if (email.length() > 0) { result.setString(0, email.toString()); functionCall.getOutputCollector().add(result); } email.setLength(0); } email.append(curLine); email.append('\n'); } // Output the final record. if (email.length() > 0) { result.setString(0, email.toString()); functionCall.getOutputCollector().add(result); } }
/**
 * Checks that the resource named by {@code testClasspathJarContents} can be found through
 * the current thread's context class loader, then forwards the incoming arguments
 * unchanged to the output collector.
 *
 * @param flowProcess the current flow process (not used here)
 * @param functionCall supplies the arguments and the output collector
 * @throws RuntimeException if the resource cannot be located
 */
@Override
public void operate(FlowProcess flowProcess, FunctionCall functionCall) {
    ClassLoader contextLoader = Thread.currentThread().getContextClassLoader();
    URL located = contextLoader.getResource(testClasspathJarContents);

    if (located == null) {
        throw new RuntimeException("cannot find resource");
    }

    // The resource is visible; pass the input straight through.
    functionCall.getOutputCollector().add(functionCall.getArguments());
}
/**
 * Reads the parsed text of the incoming tuple into a {@code Document} and hands it to
 * {@code process(...)} together with the output collector.
 *
 * <p>Any {@link Exception} raised while reading or processing is routed to
 * {@code handleException(...)}. The text stream is closed in all cases.
 *
 * @param process the current flow process (not used here)
 * @param funcCall supplies the arguments and the output collector
 */
@Override
public void operate(FlowProcess process, FunctionCall<NullContext> funcCall) {
    _input.setTupleEntry(funcCall.getArguments());

    String parsedText = _input.getParsedText();
    InputStream textStream = new StringInputStream(parsedText);

    try {
        Document document = _reader.read(textStream);
        process(_input, document, funcCall.getOutputCollector());
    } catch (Exception failure) {
        handleException(_input, failure, funcCall.getOutputCollector());
    } finally {
        IoUtils.safeClose(textStream);
    }
}
@Override public void operate(FlowProcess flowProcess, FunctionCall<NullContext> functionCall) { String email = functionCall.getArguments().getString("email"); _numEmails += 1; Metadata metadata = new Metadata(); try { InputStream stream = new ByteArrayInputStream(email.getBytes("UTF-8")); _parser.parse(stream, _handler, metadata, new ParseContext()); // _content now has all of the body text, and metadata has the header info. String messageId = getMetadata(metadata, TikaCoreProperties.IDENTIFIER); String author = ""; String address = ""; String creator = getMetadata(metadata, TikaCoreProperties.CREATOR); Matcher addressMatcher = FULL_EMAIL_ADDRESS_PATTERN.matcher(creator); if (addressMatcher.matches()) { author = addressMatcher.group(1); address = addressMatcher.group(2); } else { addressMatcher = SIMPLE_EMAIL_ADDRESS_PATTERN.matcher(creator); if (addressMatcher.matches()) { address = addressMatcher.group(1); } } String subject = getMetadata(metadata, TikaCoreProperties.TITLE); String replyId = getMetadata(metadata, TikaCoreProperties.RELATION); String creationDate = getMetadata(metadata, TikaCoreProperties.CREATED); String content = _content.toString(); _emailChars += content.length(); // If size is greater than say 4x average, skip it. Otherwise we can get // some huge emails when a person includes all of the source code for their // project. if ((_numEmails > 100) && (content.length() > (4 * _emailChars / _numEmails))) { _numSkipped += 1; return; } // Need to convert all CRLF & raw linefeeds into \n sequences, so our file format is // correct. // We do the same for tabs, so that it's easy to parse the result. 
content = content.replaceAll("\r\n", "\\\\n"); content = content.replaceAll("[\r\n]", "\\\\n"); content = content.replaceAll("\t", "\\\\t"); Tuple tuple = new Tuple(messageId, author, address, subject, creationDate, replyId, content); functionCall.getOutputCollector().add(tuple); } catch (Exception e) { LOGGER.error("Exception parsing email: " + e.getMessage()); } catch (NoClassDefFoundError e) { // This will happen when we have an embedded object (multi-part email) which // needs parsing support we don't include. LOGGER.error("Exception parsing email due to missing class: " + e.getMessage()); } }