public static List<String> tokenizeString(Analyzer analyzer, String string) { List<String> result = new ArrayList<String>(); try { TokenStream stream = analyzer.tokenStream(null, new StringReader(string)); stream.reset(); while (stream.incrementToken()) { result.add(stream.getAttribute(CharTermAttribute.class).toString()); } } catch (IOException e) { // not thrown b/c we're using a string reader... throw new RuntimeException(e); } return result; }
/** * Analyzes the given TokenStream, collecting the Tokens it produces. * * @param tokenStream TokenStream to analyze * @return List of tokens produced from the TokenStream */ private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) { List<AttributeSource> tokens = new ArrayList<AttributeSource>(); // for backwards compatibility, add all "common" attributes tokenStream.addAttribute(PositionIncrementAttribute.class); tokenStream.addAttribute(OffsetAttribute.class); tokenStream.addAttribute(TypeAttribute.class); final BytesRef bytes = new BytesRef(); try { tokenStream.reset(); while (tokenStream.incrementToken()) { tokens.add(tokenStream.cloneAttributes()); } } catch (IOException ioe) { throw new RuntimeException("Error occured while iterating over tokenstream", ioe); } return tokens; }
/** * Converts the list of Tokens to a list of NamedLists representing the tokens. * * @param tokens Tokens to convert * @param context The analysis context * @return List of NamedLists containing the relevant information taken from the tokens */ private List<NamedList> convertTokensToNamedLists( final List<AttributeSource> tokens, AnalysisContext context) { final List<NamedList> tokensNamedLists = new ArrayList<NamedList>(); final int[] positions = new int[tokens.size()]; int position = 0; for (int i = 0, c = tokens.size(); i < c; i++) { AttributeSource token = tokens.get(i); position += token.addAttribute(PositionIncrementAttribute.class).getPositionIncrement(); positions[i] = position; } // sort the tokens by absoulte position new SorterTemplate() { @Override protected void swap(int i, int j) { final int p = positions[i]; positions[i] = positions[j]; positions[j] = p; Collections.swap(tokens, i, j); } @Override protected int compare(int i, int j) { return positions[i] - positions[j]; } @Override protected void setPivot(int i) { pivot = positions[i]; } @Override protected int comparePivot(int j) { return pivot - positions[j]; } private int pivot; }.mergeSort(0, tokens.size() - 1); FieldType fieldType = context.getFieldType(); final CharArr textBuf = new CharArr(); for (int i = 0, c = tokens.size(); i < c; i++) { AttributeSource token = tokens.get(i); final NamedList<Object> tokenNamedList = new SimpleOrderedMap<Object>(); final TermToBytesRefAttribute termAtt = token.getAttribute(TermToBytesRefAttribute.class); BytesRef rawBytes = termAtt.getBytesRef(); termAtt.fillBytesRef(); textBuf.reset(); fieldType.indexedToReadable(rawBytes, textBuf); final String text = textBuf.toString(); tokenNamedList.add("text", text); if (token.hasAttribute(CharTermAttribute.class)) { final String rawText = token.getAttribute(CharTermAttribute.class).toString(); if (!rawText.equals(text)) { tokenNamedList.add("raw_text", rawText); } } tokenNamedList.add("raw_bytes", rawBytes.toString()); if (context.getTermsToMatch().contains(rawBytes)) { tokenNamedList.add("match", true); } tokenNamedList.add("position", positions[i]); token.reflectWith( new AttributeReflector() { public void reflect(Class<? extends Attribute> attClass, String key, Object value) { // leave out position and bytes term if (TermToBytesRefAttribute.class.isAssignableFrom(attClass)) return; if (CharTermAttribute.class.isAssignableFrom(attClass)) return; if (PositionIncrementAttribute.class.isAssignableFrom(attClass)) return; String k = attClass.getName() + '#' + key; // map keys for "standard attributes": if (ATTRIBUTE_MAPPING.containsKey(k)) { k = ATTRIBUTE_MAPPING.get(k); } if (value instanceof Payload) { final Payload p = (Payload) value; value = new BytesRef(p.getData()).toString(); } tokenNamedList.add(k, value); } }); tokensNamedLists.add(tokenNamedList); } return tokensNamedLists; }
public static List<String> getText(BytesWritable value, Boolean tokenizep) throws InterruptedException { Session s = Session.getDefaultInstance(new Properties()); InputStream is = new ByteArrayInputStream(value.getBytes()); List<String> out = new ArrayList<String>(); try { MimeMessage message = new MimeMessage(s, is); message.getAllHeaderLines(); Analyzer standard_analyzer = new StandardAnalyzer(Version.LUCENE_43); Analyzer email_analyzer = new UAX29URLEmailAnalyzer(Version.LUCENE_43); Address[] fromAddrs = message.getFrom(); String fromAddrstr = ""; if (fromAddrs != null) { for (Address addr : fromAddrs) { fromAddrstr += (addr.toString() + " "); } } Address[] toAddrs = message.getAllRecipients(); String toAddrstr = ""; if (toAddrs != null) { for (Address addr : toAddrs) { toAddrstr += (addr.toString() + " "); } } String subject = message.getSubject(); String body = ""; try { Object content = message.getContent(); // System.err.println(content.getContentType()); if (content instanceof String) { body = (String) content; } else if (content instanceof Multipart) { Multipart mp = (Multipart) content; for (int i = 0; i < mp.getCount(); i++) { BodyPart bp = mp.getBodyPart(i); // System.err.println(bp.getContentType()); Object c = bp.getContent(); if (c instanceof String) { body = (String) c; } } } // people do really evil things with email, we're not sorting through it all now } catch (DecodingException e) { System.err.println("DecodingException"); } catch (UnsupportedEncodingException e) { System.err.println("UnsuportedEncodingException"); } catch (IOException e) { System.err.println("IOException"); } if (tokenizep) { List<String> fromData = new ArrayList<String>(); List<String> toData = new ArrayList<String>(); List<String> subjectData = new ArrayList<String>(); List<String> bodyData = new ArrayList<String>(); if (fromAddrstr != null) { fromData = tokenizeString(email_analyzer, fromAddrstr); } if (toAddrstr != null) { toData = tokenizeString(email_analyzer, toAddrstr); } if (subject != null) { subjectData = tokenizeString(standard_analyzer, subject); } if (body != null) { bodyData = tokenizeString(standard_analyzer, body); } out.add("FROM "); out.addAll(fromData); out.add("TO "); out.addAll(toData); out.add("SUBJECT "); out.addAll(subjectData); out.add("BODY "); out.addAll(bodyData); } else { // if not tokenizep, return list with from and subject fields only out.add(fromAddrstr); out.add(subject); } } catch (MessagingException e) { System.err.println("MessagineException"); } return out; }