コード例 #1
0
 public static List<String> tokenizeString(Analyzer analyzer, String string) {
   List<String> result = new ArrayList<String>();
   try {
     TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
     stream.reset();
     while (stream.incrementToken()) {
       result.add(stream.getAttribute(CharTermAttribute.class).toString());
     }
   } catch (IOException e) {
     // not thrown b/c we're using a string reader...
     throw new RuntimeException(e);
   }
   return result;
 }
コード例 #2
0
  /**
   * Analyzes the given TokenStream, collecting the Tokens it produces.
   *
   * @param tokenStream TokenStream to analyze
   * @return List of tokens produced from the TokenStream
   */
  private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
    List<AttributeSource> tokens = new ArrayList<AttributeSource>();
    // for backwards compatibility, add all "common" attributes
    tokenStream.addAttribute(PositionIncrementAttribute.class);
    tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.addAttribute(TypeAttribute.class);
    final BytesRef bytes = new BytesRef();
    try {
      tokenStream.reset();
      while (tokenStream.incrementToken()) {
        tokens.add(tokenStream.cloneAttributes());
      }
    } catch (IOException ioe) {
      throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
    }

    return tokens;
  }
コード例 #3
0
  /**
   * Converts the list of Tokens to a list of NamedLists representing the tokens.
   *
   * @param tokens Tokens to convert
   * @param context The analysis context
   * @return List of NamedLists containing the relevant information taken from the tokens
   */
  private List<NamedList> convertTokensToNamedLists(
      final List<AttributeSource> tokens, AnalysisContext context) {
    final List<NamedList> tokensNamedLists = new ArrayList<NamedList>();

    final int[] positions = new int[tokens.size()];
    int position = 0;
    for (int i = 0, c = tokens.size(); i < c; i++) {
      AttributeSource token = tokens.get(i);
      position += token.addAttribute(PositionIncrementAttribute.class).getPositionIncrement();
      positions[i] = position;
    }

    // sort the tokens by absoulte position
    new SorterTemplate() {
      @Override
      protected void swap(int i, int j) {
        final int p = positions[i];
        positions[i] = positions[j];
        positions[j] = p;
        Collections.swap(tokens, i, j);
      }

      @Override
      protected int compare(int i, int j) {
        return positions[i] - positions[j];
      }

      @Override
      protected void setPivot(int i) {
        pivot = positions[i];
      }

      @Override
      protected int comparePivot(int j) {
        return pivot - positions[j];
      }

      private int pivot;
    }.mergeSort(0, tokens.size() - 1);

    FieldType fieldType = context.getFieldType();

    final CharArr textBuf = new CharArr();
    for (int i = 0, c = tokens.size(); i < c; i++) {
      AttributeSource token = tokens.get(i);
      final NamedList<Object> tokenNamedList = new SimpleOrderedMap<Object>();
      final TermToBytesRefAttribute termAtt = token.getAttribute(TermToBytesRefAttribute.class);
      BytesRef rawBytes = termAtt.getBytesRef();
      termAtt.fillBytesRef();

      textBuf.reset();
      fieldType.indexedToReadable(rawBytes, textBuf);
      final String text = textBuf.toString();

      tokenNamedList.add("text", text);

      if (token.hasAttribute(CharTermAttribute.class)) {
        final String rawText = token.getAttribute(CharTermAttribute.class).toString();
        if (!rawText.equals(text)) {
          tokenNamedList.add("raw_text", rawText);
        }
      }

      tokenNamedList.add("raw_bytes", rawBytes.toString());

      if (context.getTermsToMatch().contains(rawBytes)) {
        tokenNamedList.add("match", true);
      }

      tokenNamedList.add("position", positions[i]);

      token.reflectWith(
          new AttributeReflector() {
            public void reflect(Class<? extends Attribute> attClass, String key, Object value) {
              // leave out position and bytes term
              if (TermToBytesRefAttribute.class.isAssignableFrom(attClass)) return;
              if (CharTermAttribute.class.isAssignableFrom(attClass)) return;
              if (PositionIncrementAttribute.class.isAssignableFrom(attClass)) return;

              String k = attClass.getName() + '#' + key;

              // map keys for "standard attributes":
              if (ATTRIBUTE_MAPPING.containsKey(k)) {
                k = ATTRIBUTE_MAPPING.get(k);
              }

              if (value instanceof Payload) {
                final Payload p = (Payload) value;
                value = new BytesRef(p.getData()).toString();
              }

              tokenNamedList.add(k, value);
            }
          });

      tokensNamedLists.add(tokenNamedList);
    }

    return tokensNamedLists;
  }
コード例 #4
0
  public static List<String> getText(BytesWritable value, Boolean tokenizep)
      throws InterruptedException {
    Session s = Session.getDefaultInstance(new Properties());
    InputStream is = new ByteArrayInputStream(value.getBytes());
    List<String> out = new ArrayList<String>();
    try {
      MimeMessage message = new MimeMessage(s, is);
      message.getAllHeaderLines();

      Analyzer standard_analyzer = new StandardAnalyzer(Version.LUCENE_43);
      Analyzer email_analyzer = new UAX29URLEmailAnalyzer(Version.LUCENE_43);

      Address[] fromAddrs = message.getFrom();
      String fromAddrstr = "";
      if (fromAddrs != null) {
        for (Address addr : fromAddrs) {
          fromAddrstr += (addr.toString() + " ");
        }
      }

      Address[] toAddrs = message.getAllRecipients();
      String toAddrstr = "";
      if (toAddrs != null) {
        for (Address addr : toAddrs) {
          toAddrstr += (addr.toString() + " ");
        }
      }

      String subject = message.getSubject();

      String body = "";
      try {
        Object content = message.getContent();
        // System.err.println(content.getContentType());
        if (content instanceof String) {
          body = (String) content;
        } else if (content instanceof Multipart) {
          Multipart mp = (Multipart) content;
          for (int i = 0; i < mp.getCount(); i++) {
            BodyPart bp = mp.getBodyPart(i);
            // System.err.println(bp.getContentType());
            Object c = bp.getContent();
            if (c instanceof String) {
              body = (String) c;
            }
          }
        }
        // people do really evil things with email, we're not sorting through it all now
      } catch (DecodingException e) {
        System.err.println("DecodingException");
      } catch (UnsupportedEncodingException e) {
        System.err.println("UnsuportedEncodingException");
      } catch (IOException e) {
        System.err.println("IOException");
      }

      if (tokenizep) {
        List<String> fromData = new ArrayList<String>();
        List<String> toData = new ArrayList<String>();
        List<String> subjectData = new ArrayList<String>();
        List<String> bodyData = new ArrayList<String>();

        if (fromAddrstr != null) {
          fromData = tokenizeString(email_analyzer, fromAddrstr);
        }
        if (toAddrstr != null) {
          toData = tokenizeString(email_analyzer, toAddrstr);
        }
        if (subject != null) {
          subjectData = tokenizeString(standard_analyzer, subject);
        }
        if (body != null) {
          bodyData = tokenizeString(standard_analyzer, body);
        }

        out.add("FROM ");
        out.addAll(fromData);

        out.add("TO ");
        out.addAll(toData);

        out.add("SUBJECT ");
        out.addAll(subjectData);

        out.add("BODY ");
        out.addAll(bodyData);
      } else {
        // if not tokenizep, return list with from and subject fields only
        out.add(fromAddrstr);
        out.add(subject);
      }

    } catch (MessagingException e) {
      System.err.println("MessagineException");
    }

    return out;
  }