Ejemplo n.º 1
0
 /**
  * Creates a {@code Token} for the given term and offsets, then applies the requested position
  * increment and position length.
  *
  * @param term text of the token
  * @param posInc value handed to {@code setPositionIncrement}
  * @param posLength value handed to {@code setPositionLength}
  * @param startOffset start offset passed to the {@code Token} constructor
  * @param endOffset end offset passed to the {@code Token} constructor
  * @return the freshly built token
  */
 private static Token token(
     String term, int posInc, int posLength, int startOffset, int endOffset) {
   final Token built = new Token(term, startOffset, endOffset);
   // Increment first, then length: same order as before, in case the setters validate.
   built.setPositionIncrement(posInc);
   built.setPositionLength(posLength);
   return built;
 }
 /**
  * Checks that {@code TokenCountFieldMapper.countPositions} adds up the position increments of
  * all tokens in a stream plus the stream's final token increment.
  *
  * <p>Expected total: 0 (t1) + 1 (t2) + 2 (t3) + 4 (final increment) = 7. The tokens are shuffled
  * so the result must not depend on their order.
  */
 @Test
 public void testCountPositions() throws IOException {
   // We're looking to make sure that we:
   // Don't count tokens without an increment
   Token t1 = new Token();
   t1.setPositionIncrement(0);
   // Count normal tokens with one increment
   Token t2 = new Token();
   t2.setPositionIncrement(1);
   // Count funny tokens with more than one increment. This must be set on t3 (not t2), otherwise
   // t2's single increment is overwritten and t3 is never configured.
   Token t3 = new Token();
   t3.setPositionIncrement(2);
   // Count the final token increment on the rare token streams that have them
   int finalTokenIncrement = 4;
   Token[] tokens = new Token[] {t1, t2, t3};
   // Arrays.asList is a view over the array, so shuffling it reorders `tokens` itself.
   Collections.shuffle(Arrays.asList(tokens), getRandom());
   TokenStream tokenStream = new CannedTokenStream(finalTokenIncrement, 0, tokens);
   assertThat(TokenCountFieldMapper.countPositions(tokenStream), equalTo(7));
 }
Ejemplo n.º 3
0
    /**
     * Reads all of {@code input} and parses it into {@code tokens}.
     *
     * <p>The text is split on single spaces; each piece describes one position. Within a piece,
     * {@code '/'} separates tokens that share that position, and an optional {@code ":N"} suffix
     * gives a token's position length (default 1). The first token of a piece gets position
     * increment 1, the rest get 0. A token starting at position {@code p} with length {@code n}
     * is given offsets {@code [2p, 2p + 2n - 1]}. After each piece the position advances by the
     * smallest position length seen in that piece.
     *
     * <p>Also records the number of characters read in {@code inputLength}.
     *
     * @throws IOException if reading {@code input} fails
     */
    private void fillTokens() throws IOException {
      // Slurp the whole reader into memory.
      final char[] chunk = new char[256];
      final StringBuilder text = new StringBuilder();
      for (int read = input.read(chunk); read != -1; read = input.read(chunk)) {
        text.append(chunk, 0, read);
      }
      inputLength = text.length();

      final String[] slots = text.toString().split(" ");
      tokens = new ArrayList<Token>();

      int position = 0;
      int furthestEnd = -1; // highest (position + posLength) reached by any token
      int startOffset = 0;
      for (String slot : slots) {
        int shortest = Integer.MAX_VALUE;
        boolean stacked = false; // becomes true once the first token of this slot is emitted
        for (String spec : slot.split("/")) {
          final int colon = spec.indexOf(':');
          final boolean hasLength = colon != -1;
          final String term = hasLength ? spec.substring(0, colon) : spec;
          final int length = hasLength ? Integer.parseInt(spec.substring(colon + 1)) : 1;

          furthestEnd = Math.max(furthestEnd, position + length);
          shortest = Math.min(shortest, length);

          final Token t = new Token(term, startOffset, startOffset + 2 * length - 1);
          t.setPositionLength(length);
          t.setPositionIncrement(stacked ? 0 : 1);
          stacked = true;
          tokens.add(t);
        }
        position += shortest;
        startOffset = 2 * position;
      }
      assert furthestEnd <= position
          : "input string mal-formed: posLength>1 tokens hang over the end";
    }
 /**
  * For languages with canonical form: builds an alias token carrying the canonical spelling.
  *
  * <p>Only the {@code "sr"} language is handled. The term text is passed through
  * {@code SerbianFilter.convert}; when the result differs from the original text, a new token
  * with the converted text, the same offsets, position increment 0 and type {@code "alias"} is
  * returned.
  *
  * @param t token whose term text should be canonized
  * @return canonical token (or null if none, including when the converted text is unchanged)
  */
 public Token canonizeToken(Token t) {
   if (!hasCanonicalFilter) return null;
   if (lang.equals("sr")) {
     String nt = new SerbianFilter(null).convert(t.termText());
     // Compare term text with term text. The earlier check compared the Token object with a
     // String, which can never be equal, so an alias was produced even for unchanged terms.
     // A null conversion result is treated as "no canonical form".
     if (nt != null && !nt.equals(t.termText())) {
       Token tt = new Token(nt, t.startOffset(), t.endOffset());
       tt.setPositionIncrement(0); // alias sits at the same position as the original token
       tt.setType("alias");
       return tt;
     }
   }
   return null;
 }
Ejemplo n.º 5
0
  /**
   * Merges two token lists into one, rewriting position increments so that every token keeps the
   * absolute position it had in its own list.
   *
   * <p>Example: [a b] merged with [c d] produces [a/c d/b] ('/' denotes tokens in the same
   * position). Example: [a,5 b,2] merged with [c d,4 e,4] produces [c d,4/a b,2 e,2] (x,n means x
   * has posInc=n in the result).
   *
   * <p>If either list is null, the result simply contains the elements of the other list (the
   * same instances, unmodified). Otherwise every token in the result is a new {@code Token} that
   * copies the source token's offsets, type and term buffer.
   *
   * @param lst1 first list, may be null
   * @param lst2 second list, may be null
   * @return a new list holding the merged tokens; never null
   */
  public static List<Token> mergeTokens(List<Token> lst1, List<Token> lst2) {
    final ArrayList<Token> merged = new ArrayList<Token>();
    if (lst1 == null || lst2 == null) {
      // At most one list is present here, so there is nothing to interleave.
      if (lst2 != null) {
        merged.addAll(lst2);
      }
      if (lst1 != null) {
        merged.addAll(lst1);
      }
      return merged;
    }

    final Iterator<Token> firstIt = lst1.iterator();
    final Iterator<Token> secondIt = lst2.iterator();
    Token first = firstIt.hasNext() ? firstIt.next() : null;
    Token second = secondIt.hasNext() ? secondIt.next() : null;
    // Absolute positions of the current head of each list.
    int firstPos = first == null ? 0 : first.getPositionIncrement();
    int secondPos = second == null ? 0 : second.getPositionIncrement();
    // Absolute position of the token most recently appended to the result.
    int lastPos = 0;

    while (first != null || second != null) {
      // Drain the first list for as long as its head is not ahead of the second list's head.
      while (first != null && (second == null || firstPos <= secondPos)) {
        final Token copy = new Token(first.startOffset(), first.endOffset(), first.type());
        copy.copyBuffer(first.buffer(), 0, first.length());
        copy.setPositionIncrement(firstPos - lastPos);
        merged.add(copy);
        lastPos = firstPos;
        first = firstIt.hasNext() ? firstIt.next() : null;
        if (first != null) {
          firstPos += first.getPositionIncrement();
        }
      }
      // Then drain the second list under the mirrored condition.
      while (second != null && (first == null || secondPos <= firstPos)) {
        final Token copy = new Token(second.startOffset(), second.endOffset(), second.type());
        copy.copyBuffer(second.buffer(), 0, second.length());
        copy.setPositionIncrement(secondPos - lastPos);
        merged.add(copy);
        lastPos = secondPos;
        second = secondIt.hasNext() ? secondIt.next() : null;
        if (second != null) {
          secondPos += second.getPositionIncrement();
        }
      }
    }
    return merged;
  }
  /**
   * Splits the current term with {@code splitTerm} and, when that yields more than one part,
   * appends one token per part to {@code tokens}.
   *
   * <p>Each part token gets position increment 0 and offsets derived from where the part occurs
   * inside the term, shifted by the term's start offset. Parts are located by searching forward
   * from the end of the previously located part, so a part that occurs more than once in the
   * term (e.g. "foo-foo") gets the offsets of its own occurrence rather than the first one.
   * NOTE(review): this assumes {@code splitTerm} returns parts in the order they appear in the
   * term; a part not found after the previous one falls back to a search from the start.
   */
  private void splitIntoTokens() {
    String term = termAtt.term();
    String[] termParts = splitTerm(term);

    if (termParts.length > 1) {
      int termPos = offsetAtt.startOffset();
      int searchFrom = 0; // index in `term` just past the previously located part

      for (int i = 0; i < termParts.length; i++) {
        String termPart = termParts[i];
        int indexInTerm = term.indexOf(termPart, searchFrom);
        if (indexInTerm >= 0) {
          searchFrom = indexInTerm + termPart.length();
        } else {
          // Not found after the previous part: keep the original whole-term lookup.
          indexInTerm = term.indexOf(termPart);
        }
        int termPartPos = termPos + indexInTerm;
        int termPartEndPos = termPartPos + termPart.length();

        Token newToken = new Token(termPart, termPartPos, termPartEndPos);
        newToken.setPositionIncrement(0); // in the same position

        tokens.add(newToken);
      }
    }
  }
Ejemplo n.º 7
0
 /**
  * Creates a {@code Token} for the given term with both offsets set to 0, then applies the
  * requested position increment and position length.
  *
  * @param term text of the token
  * @param posInc value handed to {@code setPositionIncrement}
  * @param posLength value handed to {@code setPositionLength}
  * @return the freshly built token
  */
 private static Token token(String term, int posInc, int posLength) {
   final Token built = new Token(term, 0, 0);
   // Increment first, then length: same order as before, in case the setters validate.
   built.setPositionIncrement(posInc);
   built.setPositionLength(posLength);
   return built;
 }