コード例 #1
0
 /**
  * Runs {@code string} through {@code analyzer} and collects the text of every
  * token it emits, in order.
  *
  * <p>The token stream is consumed following the Lucene consumer workflow:
  * obtain the attribute, {@code reset()}, loop on {@code incrementToken()},
  * {@code end()}, and finally {@code close()}. Closing matters because the
  * analyzer may hand out the same stream instance again on the next call.
  *
  * @param analyzer analyzer used to produce the token stream; the field name
  *     passed to it is {@code null}
  * @param string text to tokenize
  * @return the term text of each token, in emission order (empty if no tokens)
  * @throws RuntimeException wrapping any {@link IOException} raised while
  *     consuming or closing the stream
  */
 public static List<String> tokenizeString(Analyzer analyzer, String string) {
   List<String> result = new ArrayList<String>();
   try {
     TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
     try {
       // Look the attribute up once; the same instance is updated in place
       // on every incrementToken() call.
       CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
       stream.reset();
       while (stream.incrementToken()) {
         result.add(term.toString());
       }
       stream.end();
     } finally {
       // Always release the stream, even if tokenization failed part-way.
       stream.close();
     }
   } catch (IOException e) {
     // Not expected in practice because the input is an in-memory StringReader.
     throw new RuntimeException(e);
   }
   return result;
 }
コード例 #2
0
  /**
   * Parses the bytes of {@code value} as a MIME e-mail message and returns
   * text extracted from it.
   *
   * <p>When {@code tokenizep} is true the result is a flat token list of the
   * form {@code "FROM ", <from tokens>, "TO ", <to tokens>, "SUBJECT ",
   * <subject tokens>, "BODY ", <body tokens>}. Addresses are tokenized with a
   * {@code UAX29URLEmailAnalyzer}, subject and body with a
   * {@code StandardAnalyzer}. When {@code tokenizep} is false the result holds
   * exactly two elements: the space-joined sender addresses and the subject
   * (which may be {@code null} if the message has none).
   *
   * <p>If the message cannot be parsed ({@code MessagingException}) the
   * failure is reported on stderr and an empty list is returned.
   *
   * @param value raw message bytes; only the first {@code getLength()} bytes
   *     of the backing buffer are read
   * @param tokenizep whether to tokenize all fields; must not be {@code null}
   * @return the extracted strings as described above
   * @throws InterruptedException declared for interface compatibility; not
   *     thrown by this implementation
   */
  public static List<String> getText(BytesWritable value, Boolean tokenizep)
      throws InterruptedException {
    Session s = Session.getDefaultInstance(new Properties());
    // getBytes() exposes the whole backing buffer, which can be larger than
    // the payload; restrict the stream to the valid getLength() prefix.
    InputStream is = new ByteArrayInputStream(value.getBytes(), 0, value.getLength());
    List<String> out = new ArrayList<String>();
    try {
      MimeMessage message = new MimeMessage(s, is);

      // All fields are read up front so that a MessagingException from any of
      // them yields an empty result regardless of the tokenizep mode.
      String fromAddrstr = joinAddresses(message.getFrom());
      String toAddrstr = joinAddresses(message.getAllRecipients());
      String subject = message.getSubject();
      String body = extractBody(message);

      if (tokenizep) {
        Analyzer standardAnalyzer = new StandardAnalyzer(Version.LUCENE_43);
        Analyzer emailAnalyzer = new UAX29URLEmailAnalyzer(Version.LUCENE_43);
        try {
          out.add("FROM ");
          out.addAll(tokenizeString(emailAnalyzer, fromAddrstr));

          out.add("TO ");
          out.addAll(tokenizeString(emailAnalyzer, toAddrstr));

          out.add("SUBJECT ");
          // The subject header is optional; a missing one contributes no tokens.
          if (subject != null) {
            out.addAll(tokenizeString(standardAnalyzer, subject));
          }

          out.add("BODY ");
          out.addAll(tokenizeString(standardAnalyzer, body));
        } finally {
          emailAnalyzer.close();
          standardAnalyzer.close();
        }
      } else {
        // if not tokenizep, return list with from and subject fields only
        out.add(fromAddrstr);
        out.add(subject);
      }

    } catch (MessagingException e) {
      System.err.println("MessagingException");
    }

    return out;
  }

  /**
   * Concatenates the string form of each address, each followed by a single
   * space. Returns the empty string when {@code addrs} is {@code null}.
   */
  private static String joinAddresses(Address[] addrs) {
    StringBuilder joined = new StringBuilder();
    if (addrs != null) {
      for (Address addr : addrs) {
        joined.append(addr.toString()).append(' ');
      }
    }
    return joined.toString();
  }

  /**
   * Returns the textual body of {@code message}: the content itself when it is
   * a String, or the last top-level part whose content is a String when the
   * message is multipart; otherwise the empty string. I/O and decoding
   * failures are reported on stderr and whatever was extracted so far is
   * returned.
   */
  private static String extractBody(MimeMessage message) throws MessagingException {
    String body = "";
    try {
      Object content = message.getContent();
      if (content instanceof String) {
        body = (String) content;
      } else if (content instanceof Multipart) {
        Multipart mp = (Multipart) content;
        for (int i = 0; i < mp.getCount(); i++) {
          Object partContent = mp.getBodyPart(i).getContent();
          // Later textual parts overwrite earlier ones; nested multiparts and
          // non-text parts are ignored.
          if (partContent instanceof String) {
            body = (String) partContent;
          }
        }
      }
      // people do really evil things with email, we're not sorting through it all now
    } catch (DecodingException e) {
      System.err.println("DecodingException");
    } catch (UnsupportedEncodingException e) {
      System.err.println("UnsupportedEncodingException");
    } catch (IOException e) {
      System.err.println("IOException");
    }
    return body;
  }