/**
 * Tokenizes the given string with the supplied Analyzer and returns the term texts.
 *
 * @param analyzer Analyzer used to produce the token stream
 * @param string   Text to tokenize
 * @return List of token term texts in order of appearance
 */
public static List<String> tokenizeString(Analyzer analyzer, String string) {
  List<String> result = new ArrayList<String>();
  try {
    TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
    stream.reset();
    while (stream.incrementToken()) {
      result.add(stream.getAttribute(CharTermAttribute.class).toString());
    }
    // complete the TokenStream contract: end() then close()
    stream.end();
    stream.close();
  } catch (IOException e) {
    // not thrown because we're reading from a StringReader...
    throw new RuntimeException(e);
  }
  return result;
}
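// Hypothetical usage of tokenizeString above: a minimal sketch, assuming the same
// Lucene version used elsewhere in this code (LUCENE_43). The method name and the
// sample text are illustrative only.
static List<String> exampleTokenizeString() {
  Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
  List<String> terms = tokenizeString(analyzer, "Quick brown fox");
  // StandardAnalyzer lower-cases and drops stopwords, so terms is roughly
  // ["quick", "brown", "fox"]
  analyzer.close();
  return terms;
}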
/**
 * Analyzes the given TokenStream, collecting the Tokens it produces.
 *
 * @param tokenStream TokenStream to analyze
 * @return List of tokens produced from the TokenStream
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
  List<AttributeSource> tokens = new ArrayList<AttributeSource>();

  // for backwards compatibility, add all "common" attributes
  tokenStream.addAttribute(PositionIncrementAttribute.class);
  tokenStream.addAttribute(OffsetAttribute.class);
  tokenStream.addAttribute(TypeAttribute.class);

  try {
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
      tokens.add(tokenStream.cloneAttributes());
    }
  } catch (IOException ioe) {
    throw new RuntimeException("Error occurred while iterating over tokenstream", ioe);
  }

  return tokens;
}
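// Hypothetical caller of analyzeTokenStream above (a sketch from inside the same
// class; the field name and sample text are assumptions for illustration):
private List<AttributeSource> exampleAnalyze() throws IOException {
  Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
  TokenStream stream = analyzer.tokenStream("content", new StringReader("Quick brown fox"));
  // each element is an independent snapshot of one token's attributes
  // (term bytes, offsets, position increment, type, ...)
  List<AttributeSource> captured = analyzeTokenStream(stream);
  stream.close();
  analyzer.close();
  return captured;
}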
/**
 * Analyzes Korean text.
 *
 * @throws MorphException
 */
private void analysisKorean(String input) throws MorphException {
  input = trimHangul(input);
  List<AnalysisOutput> outputs = morph.analyze(input);
  if (outputs.size() == 0) return;

  Map<String, KoreanToken> map = new LinkedHashMap<String, KoreanToken>();
  if (hasOrigin) map.put("0:" + input, new KoreanToken(input, offsetAtt.startOffset()));

  extractKeyword(outputs, offsetAtt.startOffset(), map, 0);

  // if (outputs.get(0).getScore() >= AnalysisOutput.SCORE_COMPOUNDS) {
  //   extractKeyword(outputs, offsetAtt.startOffset(), map, 0);
  // } else {
  //   // check whether the input text has some insert spacing errors.
  //   List<AnalysisOutput> list = wsAnal.analyze(input);
  //   List<AnalysisOutput> results = new ArrayList<AnalysisOutput>();
  //   if (list.size() > 1 && wsAnal.getOutputScore(list) > AnalysisOutput.SCORE_ANALYSIS) {
  //     int offset = 0;
  //     for (AnalysisOutput o : list) {
  //       if (hasOrigin) map.put(o.getSource(),
  //           new Token(o.getSource(), offsetAtt.startOffset() + offset, 1));
  //       results.addAll(morph.analyze(o.getSource()));
  //       offset += o.getSource().length();
  //     }
  //   } else {
  //     results.addAll(outputs);
  //   }
  //   extractKeyword(results, offsetAtt.startOffset(), map, 0);
  // }

  Collection<KoreanToken> values = map.values();
  for (KoreanToken kt : values) {
    kt.setOutputs(outputs);
  }
  morphQueue.addAll(map.values());
}
@Override
public void reset() throws IOException {
  super.reset();
  tokenIterator = tokens.iterator();
}
/**
 * Creates a new ListBasedTokenStream which uses the given tokens as its token source.
 *
 * @param tokens Source of tokens to be used
 */
ListBasedTokenStream(List<AttributeSource> tokens) {
  this.tokens = tokens;
  tokenIterator = tokens.iterator();
}
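// A minimal sketch of the incrementToken() that pairs with the constructor and
// reset() above; this body is an assumption about how the captured AttributeSources
// would be replayed, not part of the excerpt:
@Override
public boolean incrementToken() throws IOException {
  if (tokenIterator.hasNext()) {
    clearAttributes();
    AttributeSource next = tokenIterator.next();
    // make sure this stream declares every attribute present in the captured token
    Iterator<Class<? extends Attribute>> atts = next.getAttributeClassesIterator();
    while (atts.hasNext()) {
      addAttribute(atts.next());
    }
    // replay the captured token's state into this stream
    next.copyTo(this);
    return true;
  } else {
    return false;
  }
}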
/**
 * Converts the list of Tokens to a list of NamedLists representing the tokens.
 *
 * @param tokens  Tokens to convert
 * @param context The analysis context
 * @return List of NamedLists containing the relevant information taken from the tokens
 */
private List<NamedList> convertTokensToNamedLists(
    final List<AttributeSource> tokens, AnalysisContext context) {
  final List<NamedList> tokensNamedLists = new ArrayList<NamedList>();

  // compute each token's absolute position from its position increment
  final int[] positions = new int[tokens.size()];
  int position = 0;
  for (int i = 0, c = tokens.size(); i < c; i++) {
    AttributeSource token = tokens.get(i);
    position += token.addAttribute(PositionIncrementAttribute.class).getPositionIncrement();
    positions[i] = position;
  }

  // sort the tokens by absolute position
  new SorterTemplate() {
    @Override
    protected void swap(int i, int j) {
      final int p = positions[i];
      positions[i] = positions[j];
      positions[j] = p;
      Collections.swap(tokens, i, j);
    }

    @Override
    protected int compare(int i, int j) {
      return positions[i] - positions[j];
    }

    @Override
    protected void setPivot(int i) {
      pivot = positions[i];
    }

    @Override
    protected int comparePivot(int j) {
      return pivot - positions[j];
    }

    private int pivot;
  }.mergeSort(0, tokens.size() - 1);

  FieldType fieldType = context.getFieldType();

  final CharArr textBuf = new CharArr();
  for (int i = 0, c = tokens.size(); i < c; i++) {
    AttributeSource token = tokens.get(i);
    final NamedList<Object> tokenNamedList = new SimpleOrderedMap<Object>();
    final TermToBytesRefAttribute termAtt = token.getAttribute(TermToBytesRefAttribute.class);
    BytesRef rawBytes = termAtt.getBytesRef();
    termAtt.fillBytesRef();

    textBuf.reset();
    fieldType.indexedToReadable(rawBytes, textBuf);
    final String text = textBuf.toString();

    tokenNamedList.add("text", text);

    if (token.hasAttribute(CharTermAttribute.class)) {
      final String rawText = token.getAttribute(CharTermAttribute.class).toString();
      if (!rawText.equals(text)) {
        tokenNamedList.add("raw_text", rawText);
      }
    }

    tokenNamedList.add("raw_bytes", rawBytes.toString());

    if (context.getTermsToMatch().contains(rawBytes)) {
      tokenNamedList.add("match", true);
    }

    tokenNamedList.add("position", positions[i]);

    token.reflectWith(
        new AttributeReflector() {
          public void reflect(Class<? extends Attribute> attClass, String key, Object value) {
            // leave out position and bytes term
            if (TermToBytesRefAttribute.class.isAssignableFrom(attClass)) return;
            if (CharTermAttribute.class.isAssignableFrom(attClass)) return;
            if (PositionIncrementAttribute.class.isAssignableFrom(attClass)) return;

            String k = attClass.getName() + '#' + key;

            // map keys for "standard attributes":
            if (ATTRIBUTE_MAPPING.containsKey(k)) {
              k = ATTRIBUTE_MAPPING.get(k);
            }

            if (value instanceof Payload) {
              final Payload p = (Payload) value;
              value = new BytesRef(p.getData()).toString();
            }

            tokenNamedList.add(k, value);
          }
        });

    tokensNamedLists.add(tokenNamedList);
  }

  return tokensNamedLists;
}
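// Hypothetical consumer of convertTokensToNamedLists above (a sketch; assumes a
// token list from analyzeTokenStream() and a prepared AnalysisContext). Only the
// keys the method adds explicitly ("text", "position") are relied on here.
private void examplePrintTokenInfo(List<AttributeSource> tokens, AnalysisContext context) {
  for (NamedList info : convertTokensToNamedLists(tokens, context)) {
    System.out.println(info.get("text") + " @ position " + info.get("position"));
  }
}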
/**
 * Extracts text from a raw MIME message. When tokenizep is true, the from, to,
 * subject and body fields are tokenized with Lucene analyzers and returned as a
 * flat list of section markers and tokens; otherwise only the raw from and
 * subject strings are returned.
 */
public static List<String> getText(BytesWritable value, Boolean tokenizep)
    throws InterruptedException {
  Session s = Session.getDefaultInstance(new Properties());
  InputStream is = new ByteArrayInputStream(value.getBytes());
  List<String> out = new ArrayList<String>();
  try {
    MimeMessage message = new MimeMessage(s, is);
    message.getAllHeaderLines();

    Analyzer standard_analyzer = new StandardAnalyzer(Version.LUCENE_43);
    Analyzer email_analyzer = new UAX29URLEmailAnalyzer(Version.LUCENE_43);

    Address[] fromAddrs = message.getFrom();
    String fromAddrstr = "";
    if (fromAddrs != null) {
      for (Address addr : fromAddrs) {
        fromAddrstr += (addr.toString() + " ");
      }
    }

    Address[] toAddrs = message.getAllRecipients();
    String toAddrstr = "";
    if (toAddrs != null) {
      for (Address addr : toAddrs) {
        toAddrstr += (addr.toString() + " ");
      }
    }

    String subject = message.getSubject();

    String body = "";
    try {
      Object content = message.getContent();
      // System.err.println(content.getContentType());
      if (content instanceof String) {
        body = (String) content;
      } else if (content instanceof Multipart) {
        Multipart mp = (Multipart) content;
        for (int i = 0; i < mp.getCount(); i++) {
          BodyPart bp = mp.getBodyPart(i);
          // System.err.println(bp.getContentType());
          Object c = bp.getContent();
          if (c instanceof String) {
            body = (String) c;
          }
        }
      }
      // people do really evil things with email, we're not sorting through it all now
    } catch (DecodingException e) {
      System.err.println("DecodingException");
    } catch (UnsupportedEncodingException e) {
      System.err.println("UnsupportedEncodingException");
    } catch (IOException e) {
      System.err.println("IOException");
    }

    if (tokenizep) {
      List<String> fromData = new ArrayList<String>();
      List<String> toData = new ArrayList<String>();
      List<String> subjectData = new ArrayList<String>();
      List<String> bodyData = new ArrayList<String>();
      if (fromAddrstr != null) {
        fromData = tokenizeString(email_analyzer, fromAddrstr);
      }
      if (toAddrstr != null) {
        toData = tokenizeString(email_analyzer, toAddrstr);
      }
      if (subject != null) {
        subjectData = tokenizeString(standard_analyzer, subject);
      }
      if (body != null) {
        bodyData = tokenizeString(standard_analyzer, body);
      }
      out.add("FROM ");
      out.addAll(fromData);
      out.add("TO ");
      out.addAll(toData);
      out.add("SUBJECT ");
      out.addAll(subjectData);
      out.add("BODY ");
      out.addAll(bodyData);
    } else {
      // if not tokenizep, return list with from and subject fields only
      out.add(fromAddrstr);
      out.add(subject);
    }
  } catch (MessagingException e) {
    System.err.println("MessagingException");
  }
  return out;
}
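// Hypothetical usage of getText above (a sketch; the inline MIME message stands in
// for the value of a Hadoop record, and the exact tokens produced depend on the
// analyzers' stopword and lower-casing behavior):
static void exampleGetText() throws InterruptedException {
  byte[] rawEmail = ("From: alice@example.com\r\n"
      + "To: bob@example.com\r\n"
      + "Subject: Lunch\r\n"
      + "\r\n"
      + "See you at noon.\r\n").getBytes();
  List<String> sections = getText(new BytesWritable(rawEmail), true);
  // sections holds the "FROM ", "TO ", "SUBJECT " and "BODY " markers, each
  // followed by the tokens produced for that field
  for (String s : sections) {
    System.out.println(s);
  }
}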