public static List<String> tokenizeString(Analyzer analyzer, String string) { List<String> result = new ArrayList<String>(); try { TokenStream stream = analyzer.tokenStream(null, new StringReader(string)); stream.reset(); while (stream.incrementToken()) { result.add(stream.getAttribute(CharTermAttribute.class).toString()); } } catch (IOException e) { // not thrown b/c we're using a string reader... throw new RuntimeException(e); } return result; }
public static List<String> getText(BytesWritable value, Boolean tokenizep) throws InterruptedException { Session s = Session.getDefaultInstance(new Properties()); InputStream is = new ByteArrayInputStream(value.getBytes()); List<String> out = new ArrayList<String>(); try { MimeMessage message = new MimeMessage(s, is); message.getAllHeaderLines(); Analyzer standard_analyzer = new StandardAnalyzer(Version.LUCENE_43); Analyzer email_analyzer = new UAX29URLEmailAnalyzer(Version.LUCENE_43); Address[] fromAddrs = message.getFrom(); String fromAddrstr = ""; if (fromAddrs != null) { for (Address addr : fromAddrs) { fromAddrstr += (addr.toString() + " "); } } Address[] toAddrs = message.getAllRecipients(); String toAddrstr = ""; if (toAddrs != null) { for (Address addr : toAddrs) { toAddrstr += (addr.toString() + " "); } } String subject = message.getSubject(); String body = ""; try { Object content = message.getContent(); // System.err.println(content.getContentType()); if (content instanceof String) { body = (String) content; } else if (content instanceof Multipart) { Multipart mp = (Multipart) content; for (int i = 0; i < mp.getCount(); i++) { BodyPart bp = mp.getBodyPart(i); // System.err.println(bp.getContentType()); Object c = bp.getContent(); if (c instanceof String) { body = (String) c; } } } // people do really evil things with email, we're not sorting through it all now } catch (DecodingException e) { System.err.println("DecodingException"); } catch (UnsupportedEncodingException e) { System.err.println("UnsuportedEncodingException"); } catch (IOException e) { System.err.println("IOException"); } if (tokenizep) { List<String> fromData = new ArrayList<String>(); List<String> toData = new ArrayList<String>(); List<String> subjectData = new ArrayList<String>(); List<String> bodyData = new ArrayList<String>(); if (fromAddrstr != null) { fromData = tokenizeString(email_analyzer, fromAddrstr); } if (toAddrstr != null) { toData = tokenizeString(email_analyzer, toAddrstr); } if (subject != null) { subjectData = tokenizeString(standard_analyzer, subject); } if (body != null) { bodyData = tokenizeString(standard_analyzer, body); } out.add("FROM "); out.addAll(fromData); out.add("TO "); out.addAll(toData); out.add("SUBJECT "); out.addAll(subjectData); out.add("BODY "); out.addAll(bodyData); } else { // if not tokenizep, return list with from and subject fields only out.add(fromAddrstr); out.add(subject); } } catch (MessagingException e) { System.err.println("MessagineException"); } return out; }