Java List Examples, org.apache.lucene.analysis.tokenattributes.List Java Examples

Example #1

0

Show file

File: MailSorterUtil.java Project: nicolasavru/hadoop-mail-sorter

 /**
  * Runs {@code string} through {@code analyzer} and collects the text of every token emitted.
  *
  * <p>The stream is consumed following the TokenStream consumer workflow: {@code reset()},
  * repeated {@code incrementToken()}, {@code end()}, and finally {@code close()}. Closing the
  * stream matters because the same analyzer may be asked for another stream afterwards.
  *
  * @param analyzer analyzer used to create the token stream
  * @param string text to tokenize
  * @return the token terms, in the order the stream produced them
  * @throws RuntimeException wrapping any {@link IOException} raised while consuming the stream
  */
 public static List<String> tokenizeString(Analyzer analyzer, String string) {
   List<String> result = new ArrayList<String>();
   try {
     TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
     try {
       // Obtain the term attribute once; the stream updates this same instance per token.
       CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
       stream.reset();
       while (stream.incrementToken()) {
         result.add(term.toString());
       }
       stream.end();
     } finally {
       // Always release the stream, even if iteration failed part-way through.
       stream.close();
     }
   } catch (IOException e) {
     // not expected in practice because the input is an in-memory string reader
     throw new RuntimeException(e);
   }
   return result;
 }

Example #2

0

Show file

File: AnalysisRequestHandlerBase.java Project: ieure/lucene-solr-snapshot

  /**
   * Analyzes the given TokenStream, collecting the Tokens it produces.
   *
   * <p>Each token is captured as a clone of the stream's attribute state at the moment
   * {@code incrementToken()} returned {@code true}. This method resets the stream before
   * iterating; it does not call {@code end()} or {@code close()} on it.
   *
   * @param tokenStream TokenStream to analyze
   * @return List of tokens produced from the TokenStream
   * @throws RuntimeException wrapping any {@link IOException} raised while iterating
   */
  private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
    List<AttributeSource> tokens = new ArrayList<AttributeSource>();
    // for backwards compatibility, add all "common" attributes
    tokenStream.addAttribute(PositionIncrementAttribute.class);
    tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.addAttribute(TypeAttribute.class);
    try {
      tokenStream.reset();
      while (tokenStream.incrementToken()) {
        // snapshot the current attribute values; the stream reuses its attribute instances
        tokens.add(tokenStream.cloneAttributes());
      }
    } catch (IOException ioe) {
      throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
    }

    return tokens;
  }

Example #3

0

Show file

File: KoreanFilter.java Project: skyer9/arirang.lucene-analyzer-v4

  /**
   * Runs morphological analysis on a piece of Korean text and queues the resulting tokens.
   *
   * <p>The input is first passed through {@code trimHangul}. If the analyzer yields no outputs
   * nothing is queued. Otherwise tokens are gathered in insertion order (optionally starting
   * with the trimmed input itself when {@code hasOrigin} is set), each token is given the full
   * list of analysis outputs, and all of them are appended to {@code morphQueue}.
   *
   * @param input text to analyze
   * @throws MorphException propagated from the analysis helpers this method calls
   */
  private void analysisKorean(String input) throws MorphException {
    final String trimmed = trimHangul(input);
    final List<AnalysisOutput> analyses = morph.analyze(trimmed);
    if (analyses.isEmpty()) {
      return;
    }

    // LinkedHashMap keeps the tokens in the order they were registered.
    final Map<String, KoreanToken> tokensByKey = new LinkedHashMap<String, KoreanToken>();
    if (hasOrigin) {
      tokensByKey.put("0:" + trimmed, new KoreanToken(trimmed, offsetAtt.startOffset()));
    }

    // Keyword extraction is unconditional. An earlier, now disabled, variant only did this when
    // the first output's score reached AnalysisOutput.SCORE_COMPOUNDS and otherwise re-analyzed
    // the text through a spacing analyzer (wsAnal) before extracting.
    extractKeyword(analyses, offsetAtt.startOffset(), tokensByKey, 0);

    final Collection<KoreanToken> collected = tokensByKey.values();
    for (KoreanToken token : collected) {
      token.setOutputs(analyses);
    }

    morphQueue.addAll(tokensByKey.values());
  }

Example #4

0

Show file

File: AnalysisRequestHandlerBase.java Project: ieure/lucene-solr-snapshot

 /**
  * Resets the parent state and rewinds this stream to the first token of the backing list.
  *
  * @throws IOException if the superclass reset fails
  */
 @Override
 public void reset() throws IOException {
   super.reset();
   // a fresh iterator restarts iteration from the beginning of the token list
   this.tokenIterator = this.tokens.iterator();
 }

Example #5

0

Show file

File: AnalysisRequestHandlerBase.java Project: ieure/lucene-solr-snapshot

 /**
  * Builds a ListBasedTokenStream that serves its tokens from the supplied list.
  *
  * <p>The list reference is stored as-is (no copy is made) and an iterator over it is opened
  * immediately.
  *
  * @param tokens Source of tokens to be used
  */
 ListBasedTokenStream(List<AttributeSource> tokens) {
   this.tokens = tokens;
   this.tokenIterator = this.tokens.iterator();
 }

Example #6

0

Show file

File: AnalysisRequestHandlerBase.java Project: ieure/lucene-solr-snapshot

  /**
   * Converts the list of Tokens to a list of NamedLists representing the tokens.
   *
   * <p>The absolute position of each token is computed by summing position increments, the
   * tokens are then reordered (in place, in the supplied list) by that absolute position, and
   * finally every token is rendered into a NamedList holding its readable text, raw text (only
   * when it differs), raw bytes, match flag, position and all remaining reflected attributes.
   *
   * @param tokens Tokens to convert; this list is reordered by the method
   * @param context The analysis context
   * @return List of NamedLists containing the relevant information taken from the tokens
   */
  private List<NamedList> convertTokensToNamedLists(
      final List<AttributeSource> tokens, AnalysisContext context) {
    final int tokenCount = tokens.size();

    // Absolute position of token i = sum of the position increments of tokens 0..i.
    final int[] positions = new int[tokenCount];
    int runningPosition = 0;
    for (int idx = 0; idx < tokenCount; idx++) {
      final PositionIncrementAttribute posIncAtt =
          tokens.get(idx).addAttribute(PositionIncrementAttribute.class);
      runningPosition += posIncAtt.getPositionIncrement();
      positions[idx] = runningPosition;
    }

    // sort the tokens by absolute position, keeping positions[] and tokens in lock-step
    new SorterTemplate() {
      private int pivot;

      @Override
      protected int compare(int i, int j) {
        return positions[i] - positions[j];
      }

      @Override
      protected void swap(int i, int j) {
        final int held = positions[i];
        positions[i] = positions[j];
        positions[j] = held;
        Collections.swap(tokens, i, j);
      }

      @Override
      protected void setPivot(int i) {
        pivot = positions[i];
      }

      @Override
      protected int comparePivot(int j) {
        return pivot - positions[j];
      }
    }.mergeSort(0, tokenCount - 1);

    final FieldType fieldType = context.getFieldType();
    final CharArr readableBuf = new CharArr();
    final List<NamedList> converted = new ArrayList<NamedList>();

    for (int idx = 0; idx < tokenCount; idx++) {
      final AttributeSource token = tokens.get(idx);
      final NamedList<Object> entry = new SimpleOrderedMap<Object>();

      // Fetch the shared BytesRef first, then ask the attribute to fill it with the term bytes.
      final TermToBytesRefAttribute bytesAtt = token.getAttribute(TermToBytesRefAttribute.class);
      final BytesRef rawBytes = bytesAtt.getBytesRef();
      bytesAtt.fillBytesRef();

      readableBuf.reset();
      fieldType.indexedToReadable(rawBytes, readableBuf);
      final String text = readableBuf.toString();
      entry.add("text", text);

      // Only report the raw term text when it differs from the readable form.
      if (token.hasAttribute(CharTermAttribute.class)) {
        final String rawText = token.getAttribute(CharTermAttribute.class).toString();
        if (!rawText.equals(text)) {
          entry.add("raw_text", rawText);
        }
      }

      entry.add("raw_bytes", rawBytes.toString());

      if (context.getTermsToMatch().contains(rawBytes)) {
        entry.add("match", true);
      }

      entry.add("position", positions[idx]);

      token.reflectWith(
          new AttributeReflector() {
            public void reflect(Class<? extends Attribute> attClass, String key, Object value) {
              // leave out position and bytes term; they were already emitted above
              final boolean handledAbove =
                  TermToBytesRefAttribute.class.isAssignableFrom(attClass)
                      || CharTermAttribute.class.isAssignableFrom(attClass)
                      || PositionIncrementAttribute.class.isAssignableFrom(attClass);
              if (handledAbove) {
                return;
              }

              // map keys for "standard attributes":
              final String qualifiedKey = attClass.getName() + '#' + key;
              final String outputKey =
                  ATTRIBUTE_MAPPING.containsKey(qualifiedKey)
                      ? ATTRIBUTE_MAPPING.get(qualifiedKey)
                      : qualifiedKey;

              // payloads are rendered through BytesRef's string form
              final Object outputValue =
                  (value instanceof Payload)
                      ? new BytesRef(((Payload) value).getData()).toString()
                      : value;

              entry.add(outputKey, outputValue);
            }
          });

      converted.add(entry);
    }

    return converted;
  }

Example #7

0

Show file

File: MailSorterUtil.java Project: nicolasavru/hadoop-mail-sorter

  /**
   * Parses a serialized MIME message and returns its text fields.
   *
   * <p>When {@code tokenizep} is true the result is the marker {@code "FROM "} followed by the
   * tokenized sender addresses, {@code "TO "} followed by the tokenized recipients,
   * {@code "SUBJECT "} followed by the tokenized subject and {@code "BODY "} followed by the
   * tokenized body. When it is false the result holds exactly two elements: the space-joined
   * sender addresses and the subject (which may be {@code null}).
   *
   * <p>If the message cannot be parsed, a diagnostic is written to standard error and whatever
   * has been collected so far (normally an empty list) is returned.
   *
   * @param value raw message bytes; only the first {@code getLength()} bytes are read
   * @param tokenizep whether to tokenize the fields; must not be {@code null} (it is unboxed)
   * @return the extracted fields as described above
   * @throws InterruptedException declared for callers; not thrown by this implementation
   */
  public static List<String> getText(BytesWritable value, Boolean tokenizep)
      throws InterruptedException {
    Session s = Session.getDefaultInstance(new Properties());
    // getBytes() exposes the whole backing array, which can be longer than the payload;
    // restrict the stream to the valid prefix so stale padding bytes are never parsed.
    InputStream is = new ByteArrayInputStream(value.getBytes(), 0, value.getLength());
    List<String> out = new ArrayList<String>();
    try {
      MimeMessage message = new MimeMessage(s, is);
      message.getAllHeaderLines(); // result intentionally unused; kept from the original flow

      String fromAddrstr = joinAddresses(message.getFrom());
      String toAddrstr = joinAddresses(message.getAllRecipients());
      String subject = message.getSubject();
      String body = extractBody(message);

      if (tokenizep) {
        // Analyzers are only needed on this branch; build them here and release them after.
        Analyzer standardAnalyzer = new StandardAnalyzer(Version.LUCENE_43);
        Analyzer emailAnalyzer = new UAX29URLEmailAnalyzer(Version.LUCENE_43);
        try {
          List<String> fromData = tokenizeString(emailAnalyzer, fromAddrstr);
          List<String> toData = tokenizeString(emailAnalyzer, toAddrstr);
          // the subject header may be absent, in which case it contributes no tokens
          List<String> subjectData =
              subject == null
                  ? new ArrayList<String>()
                  : tokenizeString(standardAnalyzer, subject);
          List<String> bodyData = tokenizeString(standardAnalyzer, body);

          out.add("FROM ");
          out.addAll(fromData);

          out.add("TO ");
          out.addAll(toData);

          out.add("SUBJECT ");
          out.addAll(subjectData);

          out.add("BODY ");
          out.addAll(bodyData);
        } finally {
          emailAnalyzer.close();
          standardAnalyzer.close();
        }
      } else {
        // if not tokenizep, return list with from and subject fields only
        out.add(fromAddrstr);
        out.add(subject);
      }

    } catch (MessagingException e) {
      System.err.println("MessagineException");
    }

    return out;
  }

  /** Joins the string form of each address, each followed by a single space; "" for null. */
  private static String joinAddresses(Address[] addrs) {
    StringBuilder joined = new StringBuilder();
    if (addrs != null) {
      for (Address addr : addrs) {
        joined.append(addr.toString()).append(' ');
      }
    }
    return joined.toString();
  }

  /**
   * Returns the message's textual body: the content itself when it is a String, otherwise the
   * last String-valued part of a Multipart; "" when neither applies or decoding fails early.
   */
  private static String extractBody(MimeMessage message) throws MessagingException {
    String body = "";
    try {
      Object content = message.getContent();
      if (content instanceof String) {
        body = (String) content;
      } else if (content instanceof Multipart) {
        Multipart mp = (Multipart) content;
        for (int i = 0; i < mp.getCount(); i++) {
          Object partContent = mp.getBodyPart(i).getContent();
          if (partContent instanceof String) {
            body = (String) partContent;
          }
        }
      }
      // people do really evil things with email, we're not sorting through it all now
    } catch (DecodingException e) {
      System.err.println("DecodingException");
    } catch (UnsupportedEncodingException e) {
      System.err.println("UnsuportedEncodingException");
    } catch (IOException e) {
      System.err.println("IOException");
    }
    return body;
  }