/**
 * Splits the mbox file named by the "line" argument into individual emails and emits one
 * single-field tuple per email.
 *
 * <p>A new record begins at every line that starts with {@code MBOX_RECORD_DIVIDER}; the divider
 * line itself is kept as the first line of the record it opens. Every line is re-terminated with
 * {@code '\n'}. The file is decoded as US-ASCII, and the reader is always closed before this
 * method returns, even when reading or emitting fails.
 *
 * @param flowProcess the current flow process (unused here)
 * @param functionCall supplies the "line" argument (the mbox file path) and the output collector
 * @throws RuntimeException if the file cannot be opened
 */
public void operate(FlowProcess flowProcess, FunctionCall<NullContext> functionCall) {
      String filename = functionCall.getArguments().getString("line");
      // A single tuple instance is reused for every emitted record.
      Tuple result = new Tuple("");

      BufferedReader reader;
      try {
        InputStream stream = new FileInputStream(filename);
        reader = new BufferedReader(new InputStreamReader(stream, "us-ascii"));
      } catch (UnsupportedEncodingException e) {
        throw new RuntimeException("Impossible exception!", e);
      } catch (Exception e) {
        throw new RuntimeException(String.format("Exception splitting mbox file %s", filename), e);
      }

      try {
        StringBuilder email = new StringBuilder();
        for (String curLine = safeReadLine(reader);
            curLine != null;
            curLine = safeReadLine(reader)) {
          if (curLine.startsWith(MBOX_RECORD_DIVIDER)) {
            // A divider closes the record collected so far (if any) and opens the next one.
            if (email.length() > 0) {
              result.setString(0, email.toString());
              functionCall.getOutputCollector().add(result);
            }

            email.setLength(0);
          }

          email.append(curLine);
          email.append('\n');
        }

        // Output the final record.
        if (email.length() > 0) {
          result.setString(0, email.toString());
          functionCall.getOutputCollector().add(result);
        }
      } finally {
        // Release the underlying file handle; previously the reader was never closed.
        try {
          reader.close();
        } catch (java.io.IOException ignored) {
          // Nothing useful can be done if closing a read-only stream fails.
        }
      }
    }
// Example no. 2
// 0
    /**
     * Forwards the incoming arguments unchanged to the output collector, after checking that the
     * resource named by {@code testClasspathJarContents} can be found through the current thread's
     * context class loader.
     *
     * @param flowProcess the current flow process (unused here)
     * @param functionCall supplies the arguments to forward and the output collector
     * @throws RuntimeException if the resource cannot be found
     */
    @Override
    public void operate(FlowProcess flowProcess, FunctionCall functionCall) {
      ClassLoader contextLoader = Thread.currentThread().getContextClassLoader();
      boolean resourceMissing = contextLoader.getResource(testClasspathJarContents) == null;

      if (resourceMissing) {
        throw new RuntimeException("cannot find resource");
      }

      functionCall.getOutputCollector().add(functionCall.getArguments());
    }
// Example no. 3
// 0
  /**
   * Loads the incoming arguments into {@code _input}, reads its parsed text into a
   * {@link Document} using {@code _reader}, and passes the result to {@code process}.
   *
   * <p>Any exception raised while reading or processing is routed to {@code handleException}
   * rather than propagated. The text stream is closed in all cases.
   *
   * @param process the current flow process (unused here)
   * @param funcCall supplies the argument tuple entry and the output collector
   */
  @Override
  public void operate(FlowProcess process, FunctionCall<NullContext> funcCall) {
    _input.setTupleEntry(funcCall.getArguments());

    InputStream parsedTextStream = new StringInputStream(_input.getParsedText());
    try {
      Document document = _reader.read(parsedTextStream);
      process(_input, document, funcCall.getOutputCollector());
    } catch (Exception e) {
      // Failures are delegated to the handler together with the input and the collector.
      handleException(_input, e, funcCall.getOutputCollector());
    } finally {
      IoUtils.safeClose(parsedTextStream);
    }
  }
    /**
     * Parses the raw email text in the "email" argument and emits one tuple with the fields
     * (messageId, author, address, subject, creationDate, replyId, content).
     *
     * <p>The author name and address are extracted from the creator metadata using
     * {@code FULL_EMAIL_ADDRESS_PATTERN} first and {@code SIMPLE_EMAIL_ADDRESS_PATTERN} as a
     * fallback; both stay empty when neither pattern matches. Once more than 100 emails have been
     * seen, an email whose content is longer than four times the running average is counted in
     * {@code _numSkipped} and not emitted. Line breaks and tabs in the content are replaced by the
     * two-character sequences {@code \n} and {@code \t}.
     *
     * <p>Parsing failures are logged (with the stack trace) and the email is dropped; they are not
     * propagated to the caller.
     *
     * @param flowProcess the current flow process (unused here)
     * @param functionCall supplies the "email" argument and the output collector
     */
    @Override
    public void operate(FlowProcess flowProcess, FunctionCall<NullContext> functionCall) {
      String email = functionCall.getArguments().getString("email");
      _numEmails += 1;

      Metadata metadata = new Metadata();

      try {
        InputStream stream = new ByteArrayInputStream(email.getBytes("UTF-8"));
        _parser.parse(stream, _handler, metadata, new ParseContext());

        // _content now has all of the body text, and metadata has the header info.
        String messageId = getMetadata(metadata, TikaCoreProperties.IDENTIFIER);

        String author = "";
        String address = "";
        String creator = getMetadata(metadata, TikaCoreProperties.CREATOR);
        Matcher addressMatcher = FULL_EMAIL_ADDRESS_PATTERN.matcher(creator);
        if (addressMatcher.matches()) {
          author = addressMatcher.group(1);
          address = addressMatcher.group(2);
        } else {
          addressMatcher = SIMPLE_EMAIL_ADDRESS_PATTERN.matcher(creator);
          if (addressMatcher.matches()) {
            address = addressMatcher.group(1);
          }
        }

        String subject = getMetadata(metadata, TikaCoreProperties.TITLE);
        String replyId = getMetadata(metadata, TikaCoreProperties.RELATION);
        String creationDate = getMetadata(metadata, TikaCoreProperties.CREATED);

        String content = _content.toString();
        _emailChars += content.length();

        // If size is greater than say 4x average, skip it. Otherwise we can get
        // some huge emails when a person includes all of the source code for their
        // project.
        if ((_numEmails > 100) && (content.length() > (4 * _emailChars / _numEmails))) {
          _numSkipped += 1;
          return;
        }

        // Need to convert all CRLF & raw linefeeds into \n sequences, so our file format is
        // correct. CRLF is listed first in the alternation so it collapses into a single \n
        // rather than two.
        // We do the same for tabs, so that it's easy to parse the result.
        content = content.replaceAll("\r\n|[\r\n]", "\\\\n");
        content = content.replaceAll("\t", "\\\\t");

        Tuple tuple =
            new Tuple(messageId, author, address, subject, creationDate, replyId, content);
        functionCall.getOutputCollector().add(tuple);
      } catch (Exception e) {
        // Pass the throwable so the stack trace is logged, not just the (possibly null) message.
        LOGGER.error("Exception parsing email: " + e.getMessage(), e);
      } catch (NoClassDefFoundError e) {
        // This will happen when we have an embedded object (multi-part email) which
        // needs parsing support we don't include.
        LOGGER.error("Exception parsing email due to missing class: " + e.getMessage(), e);
      }
    }