Example #1
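 /**
  * Parses a single option of a direct candidate generator. The shared spell-checker settings are
  * tried first via SuggestUtils; any option not consumed there is matched against the
  * generator-specific options below, and unknown options are rejected.
  */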
 private void parseCandidateGenerator(
     XContentParser parser,
     SearchContext context,
     String fieldName,
     PhraseSuggestionContext.DirectCandidateGenerator generator)
     throws IOException {
   if (!SuggestUtils.parseDirectSpellcheckerSettings(parser, fieldName, generator)) {
     if ("field".equals(fieldName)) {
       generator.setField(parser.text());
     } else if ("size".equals(fieldName)) {
       generator.size(parser.intValue());
     } else if ("pre_filter".equals(fieldName) || "preFilter".equals(fieldName)) {
       String analyzerName = parser.text();
       Analyzer analyzer = context.mapperService().analysisService().analyzer(analyzerName);
       if (analyzer == null) {
         throw new ElasticSearchIllegalArgumentException(
             "Analyzer [" + analyzerName + "] doesn't exists");
       }
       generator.preFilter(analyzer);
     } else if ("post_filter".equals(fieldName) || "postFilter".equals(fieldName)) {
       String analyzerName = parser.text();
       Analyzer analyzer = context.mapperService().analysisService().analyzer(analyzerName);
       if (analyzer == null) {
         throw new ElasticSearchIllegalArgumentException(
             "Analyzer [" + analyzerName + "] doesn't exists");
       }
       generator.postFilter(analyzer);
     } else {
       throw new ElasticSearchIllegalArgumentException(
           "CandidateGenerator doesn't support [" + fieldName + "]");
     }
   }
 }
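
The method above is one step of a token walk over a streaming pull parser: the caller advances the
parser, tracks the current field name, and dispatches each value token on it. As a standalone
sketch of that pattern, here is the same FIELD_NAME/value loop written against Jackson's streaming
API. This is an illustration under the assumption of Jackson 2.x; it is not the XContentParser API
itself, which ElasticSearch layers over a similar pull parser.

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;

public class GeneratorTokenWalk {
  public static void main(String[] args) throws Exception {
    // The kind of object parseCandidateGenerator consumes, one field at a time.
    String json = "{\"field\": \"title\", \"size\": 10, \"pre_filter\": \"reverse\"}";
    JsonParser parser = new JsonFactory().createParser(json);
    parser.nextToken(); // consume START_OBJECT
    String fieldName = null;
    JsonToken token;
    while ((token = parser.nextToken()) != JsonToken.END_OBJECT) {
      if (token == JsonToken.FIELD_NAME) {
        fieldName = parser.getCurrentName();
      } else { // a value token: dispatch on the remembered field name
        if ("field".equals(fieldName)) {
          System.out.println("field -> " + parser.getText());
        } else if ("size".equals(fieldName)) {
          System.out.println("size -> " + parser.getIntValue());
        } else {
          System.out.println(fieldName + " -> " + parser.getText());
        }
      }
    }
    parser.close();
  }
}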
Example #2
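  /**
   * Parses the body of a phrase suggester request: scalar options, the direct_generator array,
   * and the smoothing-model object (linear, laplace, or stupid_backoff). Afterwards it fills in
   * defaults for the model, the gram size, and the candidate generators.
   */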
  public SuggestionSearchContext.SuggestionContext parse(
      XContentParser parser, SearchContext context) throws IOException {
    PhraseSuggestionContext suggestion = new PhraseSuggestionContext(suggester);
    XContentParser.Token token;
    String fieldName = null;
    boolean gramSizeSet = false;
    while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
      if (token == XContentParser.Token.FIELD_NAME) {
        fieldName = parser.currentName();
      } else if (token.isValue()) {
        if (!SuggestUtils.parseSuggestContext(parser, context, fieldName, suggestion)) {
          if ("real_word_error_likelihood".equals(fieldName)) {
            suggestion.setRealWordErrorLikelihood(parser.floatValue());
            if (suggestion.realworldErrorLikelyhood() <= 0.0) {
              throw new ElasticSearchIllegalArgumentException(
                  "real_word_error_likelihood must be > 0.0");
            }
          } else if ("confidence".equals(fieldName)) {
            suggestion.setConfidence(parser.floatValue());
            if (suggestion.confidence() < 0.0) {
              throw new ElasticSearchIllegalArgumentException("confidence must be >= 0.0");
            }
          } else if ("separator".equals(fieldName)) {
            suggestion.setSeparator(new BytesRef(parser.text()));
          } else if ("max_errors".equals(fieldName)) {
            suggestion.setMaxErrors(parser.floatValue());
            if (suggestion.maxErrors() <= 0.0) {
              throw new ElasticSearchIllegalArgumentException("max_error must be > 0.0");
            }
          } else if ("gram_size".equals(fieldName)) {
            suggestion.setGramSize(parser.intValue());
            if (suggestion.gramSize() < 1) {
              throw new ElasticSearchIllegalArgumentException("gram_size must be >= 1");
            }
            gramSizeSet = true;
          } else if ("force_unigrams".equals(fieldName)) {
            suggestion.setRequireUnigram(parser.booleanValue());
          }
        }
      } else if (token == Token.START_ARRAY) {
        if ("direct_generator".equals(fieldName)) {
          // for now we only have a single type of generator
          while ((token = parser.nextToken()) == Token.START_OBJECT) {
            PhraseSuggestionContext.DirectCandidateGenerator generator =
                new PhraseSuggestionContext.DirectCandidateGenerator();
            while ((token = parser.nextToken()) != Token.END_OBJECT) {
              if (token == XContentParser.Token.FIELD_NAME) {
                fieldName = parser.currentName();
              }
              if (token.isValue()) {
                parseCandidateGenerator(parser, context, fieldName, generator);
              }
            }
            verifyGenerator(context, generator);
            suggestion.addGenerator(generator);
          }
        } else {
          throw new ElasticSearchIllegalArgumentException(
              "suggester[phrase]  doesn't support array field [" + fieldName + "]");
        }
      } else if (token == Token.START_OBJECT) {
        if ("linear".equals(fieldName)) {
          ensureNoSmoothing(suggestion);
          final double[] lambdas = new double[3];
          while ((token = parser.nextToken()) != Token.END_OBJECT) {
            if (token == XContentParser.Token.FIELD_NAME) {
              fieldName = parser.currentName();
            }
            if (token.isValue()) {
              if ("trigram_lambda".equals(fieldName)) {
                lambdas[0] = parser.doubleValue();
                if (lambdas[0] < 0) {
                  throw new ElasticSearchIllegalArgumentException(
                      "trigram_lambda must be positive");
                }
              }
              if ("bigram_lambda".equals(fieldName)) {
                lambdas[1] = parser.doubleValue();
                if (lambdas[1] < 0) {
                  throw new ElasticSearchIllegalArgumentException("bigram_lambda must be positive");
                }
              }
              if ("unigram_lambda".equals(fieldName)) {
                lambdas[2] = parser.doubleValue();
                if (lambdas[2] < 0) {
                  throw new ElasticSearchIllegalArgumentException(
                      "unigram_lambda must be positive");
                }
              }
            }
          }
          double sum = 0.0d;
          for (int i = 0; i < lambdas.length; i++) {
            sum += lambdas[i];
          }
          if (Math.abs(sum - 1.0) > 0.001) {
            throw new ElasticSearchIllegalArgumentException(
                "linear smoothing lambdas must sum to 1");
          }
          suggestion.setModel(
              new WordScorer.WordScorerFactory() {
                @Override
                public WordScorer newScorer(
                    IndexReader reader, String field, double realWordLikelyhood, BytesRef separator)
                    throws IOException {
                  return new LinearInterpoatingScorer(
                      reader,
                      field,
                      realWordLikelyhood,
                      separator,
                      lambdas[0],
                      lambdas[1],
                      lambdas[2]);
                }
              });
        } else if ("laplace".equals(fieldName)) {
          ensureNoSmoothing(suggestion);
          double theAlpha = 0.5;

          while ((token = parser.nextToken()) != Token.END_OBJECT) {
            if (token == XContentParser.Token.FIELD_NAME) {
              fieldName = parser.currentName();
            }
            if (token.isValue()) {
              if ("alpha".equals(fieldName)) {
                theAlpha = parser.doubleValue();
              }
            }
          }
          final double alpha = theAlpha;
          suggestion.setModel(
              new WordScorer.WordScorerFactory() {
                @Override
                public WordScorer newScorer(
                    IndexReader reader, String field, double realWordLikelyhood, BytesRef separator)
                    throws IOException {
                  return new LaplaceScorer(reader, field, realWordLikelyhood, separator, alpha);
                }
              });

        } else if ("stupid_backoff".equals(fieldName)) {
          ensureNoSmoothing(suggestion);
          double theDiscount = 0.4;
          while ((token = parser.nextToken()) != Token.END_OBJECT) {
            if (token == XContentParser.Token.FIELD_NAME) {
              fieldName = parser.currentName();
            }
            if (token.isValue()) {
              if ("discount".equals(fieldName)) {
                theDiscount = parser.doubleValue();
              }
            }
          }
          final double discount = theDiscount;
          suggestion.setModel(
              new WordScorer.WordScorerFactory() {
                @Override
                public WordScorer newScorer(
                    IndexReader reader, String field, double realWordLikelyhood, BytesRef separator)
                    throws IOException {
                  return new StupidBackoffScorer(
                      reader, field, realWordLikelyhood, separator, discount);
                }
              });

        } else {
          throw new ElasticSearchIllegalArgumentException(
              "suggester[phrase] doesn't support object field [" + fieldName + "]");
        }

      } else {
        throw new ElasticSearchIllegalArgumentException(
            "suggester[phrase] doesn't support field [" + fieldName + "]");
      }
    }

    if (suggestion.getField() == null) {
      throw new ElasticSearchIllegalArgumentException("The required field option is missing");
    }

    if (suggestion.model() == null) {
      suggestion.setModel(LaplaceScorer.FACTORY);
    }

    if (!gramSizeSet || suggestion.generators().isEmpty()) {
      final ShingleTokenFilterFactory shingleFilterFactory =
          SuggestUtils.getShingleFilterFactory(
              suggestion.getAnalyzer() == null
                  ? context.mapperService().fieldSearchAnalyzer(suggestion.getField())
                  : suggestion.getAnalyzer());
      if (!gramSizeSet) {
        // try to detect the shingle size
        if (shingleFilterFactory != null) {
          suggestion.setGramSize(shingleFilterFactory.getMaxShingleSize());
          if (suggestion.getAnalyzer() == null
              && shingleFilterFactory.getMinShingleSize() > 1
              && !shingleFilterFactory.getOutputUnigrams()) {
            throw new ElasticSearchIllegalArgumentException(
                "The default analyzer for field: ["
                    + suggestion.getField()
                    + "] doesn't emit unigrams. If this is intentional try to set the analyzer explicitly");
          }
        }
      }
      if (suggestion.generators().isEmpty()) {
        if (shingleFilterFactory != null
            && shingleFilterFactory.getMinShingleSize() > 1
            && !shingleFilterFactory.getOutputUnigrams()
            && suggestion.getRequireUnigram()) {
          throw new ElasticSearchIllegalArgumentException(
              "The default candidate generator for phrase suggest can't operate on field: ["
                  + suggestion.getField()
                  + "] since it doesn't emit unigrams. If this is intentional try to set the candidate generator field explicitly");
        }
        // use a default generator on the same field
        DirectCandidateGenerator generator = new DirectCandidateGenerator();
        generator.setField(suggestion.getField());
        suggestion.addGenerator(generator);
      }
    }

    return suggestion;
  }
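
The "linear" smoothing branch above collects three lambdas that must sum to 1 and captures them in
a word-scorer factory. The following is a minimal sketch of what linear interpolation does with
them, using invented counts; the real LinearInterpoatingScorer reads its frequencies from the
index.

public class LinearInterpolationDemo {
  public static void main(String[] args) {
    // Lambdas as they would be parsed from the "linear" object; they sum to 1.
    double trigramLambda = 0.7, bigramLambda = 0.2, unigramLambda = 0.1;
    // Hypothetical corpus counts for a phrase w1 w2 w3.
    double trigramCount = 2, bigramContextCount = 5;   // count(w1 w2 w3) / count(w1 w2)
    double bigramCount = 8, unigramContextCount = 40;  // count(w2 w3) / count(w2)
    double unigramCount = 100, totalTokens = 10_000;   // count(w3) / N

    // Mix the three maximum-likelihood estimates; a zero trigram count no longer
    // zeroes the score, because the lower-order models back it up.
    double p = trigramLambda * (trigramCount / bigramContextCount)
        + bigramLambda * (bigramCount / unigramContextCount)
        + unigramLambda * (unigramCount / totalTokens);
    System.out.println("interpolated P(w3 | w1 w2) = " + p); // 0.321
  }
}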
  /*
   * More Ideas:
   *   - add ability to find whitespace problems -> we can build a poor man's decompounder with our index based on an automaton?
   *   - add ability to build different error models maybe based on a confusion matrix?
   *   - try to combine a token with its subsequent token to find / detect word splits (optional)
   *      - for this to work we need some way to define the position length of a candidate
   *   - phonetic filters could be interesting here too for candidate selection
   */
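  /**
   * Executes the phrase suggester: builds one candidate generator per configured field, lets the
   * noisy channel checker rank corrections with the configured word scorer, and optionally
   * collates each correction against the index before adding it to the response.
   */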
  @Override
  public Suggestion<? extends Entry<? extends Option>> innerExecute(
      String name,
      PhraseSuggestionContext suggestion,
      IndexSearcher searcher,
      CharsRefBuilder spare)
      throws IOException {
    double realWordErrorLikelihood = suggestion.realworldErrorLikelyhood();
    final PhraseSuggestion response = new PhraseSuggestion(name, suggestion.getSize());
    final IndexReader indexReader = searcher.getIndexReader();
    List<PhraseSuggestionContext.DirectCandidateGenerator> generators = suggestion.generators();
    final int numGenerators = generators.size();
    final List<CandidateGenerator> gens = new ArrayList<>(generators.size());
    for (int i = 0; i < numGenerators; i++) {
      PhraseSuggestionContext.DirectCandidateGenerator generator = generators.get(i);
      DirectSpellChecker directSpellChecker = SuggestUtils.getDirectSpellChecker(generator);
      Terms terms = MultiFields.getTerms(indexReader, generator.field());
      if (terms != null) {
        gens.add(
            new DirectCandidateGenerator(
                directSpellChecker,
                generator.field(),
                generator.suggestMode(),
                indexReader,
                realWordErrorLikelihood,
                generator.size(),
                generator.preFilter(),
                generator.postFilter(),
                terms));
      }
    }
    final String suggestField = suggestion.getField();
    final Terms suggestTerms = MultiFields.getTerms(indexReader, suggestField);
    if (gens.size() > 0 && suggestTerms != null) {
      final NoisyChannelSpellChecker checker =
          new NoisyChannelSpellChecker(
              realWordErrorLikelihood, suggestion.getRequireUnigram(), suggestion.getTokenLimit());
      final BytesRef separator = suggestion.separator();
      TokenStream stream =
          checker.tokenStream(
              suggestion.getAnalyzer(), suggestion.getText(), spare, suggestion.getField());

      WordScorer wordScorer =
          suggestion
              .model()
              .newScorer(
                  indexReader, suggestTerms, suggestField, realWordErrorLikelihood, separator);
      Result checkerResult =
          checker.getCorrections(
              stream,
              new MultiCandidateGeneratorWrapper(
                  suggestion.getShardSize(), gens.toArray(new CandidateGenerator[gens.size()])),
              suggestion.maxErrors(),
              suggestion.getShardSize(),
              wordScorer,
              suggestion.confidence(),
              suggestion.gramSize());

      PhraseSuggestion.Entry resultEntry =
          buildResultEntry(suggestion, spare, checkerResult.cutoffScore);
      response.addTerm(resultEntry);

      final BytesRefBuilder byteSpare = new BytesRefBuilder();
      final EarlyTerminatingCollector collector = Lucene.createExistsCollector();
      final CompiledScript collateScript;
      if (suggestion.getCollateQueryScript() != null) {
        collateScript = suggestion.getCollateQueryScript();
      } else if (suggestion.getCollateFilterScript() != null) {
        collateScript = suggestion.getCollateFilterScript();
      } else {
        collateScript = null;
      }
      final boolean collatePrune = (collateScript != null) && suggestion.collatePrune();
      for (int i = 0; i < checkerResult.corrections.length; i++) {
        Correction correction = checkerResult.corrections[i];
        spare.copyUTF8Bytes(correction.join(SEPARATOR, byteSpare, null, null));
        boolean collateMatch = true;
        if (collateScript != null) {
          // Checks if the template query collateScript yields any documents
          // from the index for a correction, collateMatch is updated
          final Map<String, Object> vars = suggestion.getCollateScriptParams();
          vars.put(SUGGESTION_TEMPLATE_VAR_NAME, spare.toString());
          final ExecutableScript executable = scriptService.executable(collateScript, vars);
          final BytesReference querySource = (BytesReference) executable.run();
          final ParsedQuery parsedQuery;
          if (suggestion.getCollateFilterScript() != null) {
            parsedQuery =
                suggestion
                    .getQueryParserService()
                    .parse(
                        QueryBuilders.constantScoreQuery(QueryBuilders.wrapperQuery(querySource)));
          } else {
            parsedQuery = suggestion.getQueryParserService().parse(querySource);
          }
          collateMatch = Lucene.exists(searcher, parsedQuery.query(), collector);
        }
        if (!collateMatch && !collatePrune) {
          continue;
        }
        Text phrase = new StringText(spare.toString());
        Text highlighted = null;
        if (suggestion.getPreTag() != null) {
          spare.copyUTF8Bytes(
              correction.join(
                  SEPARATOR, byteSpare, suggestion.getPreTag(), suggestion.getPostTag()));
          highlighted = new StringText(spare.toString());
        }
        if (collatePrune) {
          resultEntry.addOption(
              new Suggestion.Entry.Option(
                  phrase, highlighted, (float) (correction.score), collateMatch));
        } else {
          resultEntry.addOption(
              new Suggestion.Entry.Option(phrase, highlighted, (float) (correction.score)));
        }
      }
    } else {
      response.addTerm(buildResultEntry(suggestion, spare, Double.MIN_VALUE));
    }
    return response;
  }
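
innerExecute wires the candidate generators (the error model) and the configured WordScorer (the
language model) into a NoisyChannelSpellChecker. The following toy sketch, with invented
probabilities, shows the ranking idea underneath: each candidate phrase is scored as its
language-model score times a channel score for the observed input, and the input itself competes
as a candidate. It is a conceptual illustration, not the ElasticSearch implementation.

import java.util.LinkedHashMap;
import java.util.Map;

public class NoisyChannelDemo {
  public static void main(String[] args) {
    // candidate phrase -> { language-model score, channel score }; numbers are invented
    Map<String, double[]> candidates = new LinkedHashMap<>();
    candidates.put("noble prize", new double[] {0.002, 1.0}); // the input as-is
    candidates.put("nobel prize", new double[] {0.040, 0.6}); // one edit away
    candidates.put("noble price", new double[] {0.001, 0.6});

    String best = null;
    double bestScore = -1;
    for (Map.Entry<String, double[]> e : candidates.entrySet()) {
      double score = e.getValue()[0] * e.getValue()[1]; // P(candidate) * P(observed | candidate)
      System.out.printf("%-12s -> %.4f%n", e.getKey(), score);
      if (score > bestScore) {
        bestScore = score;
        best = e.getKey();
      }
    }
    System.out.println("best correction: " + best); // "nobel prize"
  }
}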
  @Test
  public void testDuellCompletions()
      throws IOException, NoSuchFieldException, SecurityException, IllegalArgumentException,
          IllegalAccessException {
    final boolean preserveSeparators = getRandom().nextBoolean();
    final boolean preservePositionIncrements = getRandom().nextBoolean();
    final boolean usePayloads = getRandom().nextBoolean();
    final int options = preserveSeparators ? AnalyzingSuggester.PRESERVE_SEP : 0;

    XAnalyzingSuggester reference =
        new XAnalyzingSuggester(
            new StandardAnalyzer(),
            null,
            new StandardAnalyzer(),
            options,
            256,
            -1,
            preservePositionIncrements,
            null,
            false,
            1,
            XAnalyzingSuggester.SEP_LABEL,
            XAnalyzingSuggester.PAYLOAD_SEP,
            XAnalyzingSuggester.END_BYTE,
            XAnalyzingSuggester.HOLE_CHARACTER);
    LineFileDocs docs = new LineFileDocs(getRandom());
    int num = scaledRandomIntBetween(150, 300);
    final String[] titles = new String[num];
    final long[] weights = new long[num];
    for (int i = 0; i < titles.length; i++) {
      Document nextDoc = docs.nextDoc();
      IndexableField field = nextDoc.getField("title");
      titles[i] = field.stringValue();
      weights[i] = between(0, 100);
    }
    docs.close();
    final InputIterator primaryIter =
        new InputIterator() {
          int index = 0;
          long currentWeight = -1;

          @Override
          public BytesRef next() throws IOException {
            if (index < titles.length) {
              currentWeight = weights[index];
              return new BytesRef(titles[index++]);
            }
            return null;
          }

          @Override
          public long weight() {
            return currentWeight;
          }

          @Override
          public BytesRef payload() {
            return null;
          }

          @Override
          public boolean hasPayloads() {
            return false;
          }

          @Override
          public Set<BytesRef> contexts() {
            return null;
          }

          @Override
          public boolean hasContexts() {
            return false;
          }
        };
    InputIterator iter;
    if (usePayloads) {
      iter =
          new InputIterator() {
            @Override
            public long weight() {
              return primaryIter.weight();
            }

            @Override
            public BytesRef next() throws IOException {
              return primaryIter.next();
            }

            @Override
            public BytesRef payload() {
              return new BytesRef(Long.toString(weight()));
            }

            @Override
            public boolean hasPayloads() {
              return true;
            }

            @Override
            public Set<BytesRef> contexts() {
              return null;
            }

            @Override
            public boolean hasContexts() {
              return false;
            }
          };
    } else {
      iter = primaryIter;
    }
    reference.build(iter);
    PostingsFormat provider = PostingsFormat.forName(Lucene.LATEST_POSTINGS_FORMAT);

    NamedAnalyzer namedAnalyzer = new NamedAnalyzer("foo", new StandardAnalyzer());
    final CompletionFieldMapper mapper =
        new CompletionFieldMapper(
            new Names("foo"),
            namedAnalyzer,
            namedAnalyzer,
            provider,
            null,
            usePayloads,
            preserveSeparators,
            preservePositionIncrements,
            Integer.MAX_VALUE,
            indexSettings,
            AbstractFieldMapper.MultiFields.empty(),
            null,
            ContextMapping.EMPTY_MAPPING);
    Lookup buildAnalyzingLookup = buildAnalyzingLookup(mapper, titles, titles, weights);
    Field field = buildAnalyzingLookup.getClass().getDeclaredField("maxAnalyzedPathsForOneInput");
    field.setAccessible(true);
    Field refField = reference.getClass().getDeclaredField("maxAnalyzedPathsForOneInput");
    refField.setAccessible(true);
    assertThat(refField.get(reference), equalTo(field.get(buildAnalyzingLookup)));

    for (int i = 0; i < titles.length; i++) {
      int res = between(1, 10);
      final StringBuilder builder = new StringBuilder();
      SuggestUtils.analyze(
          namedAnalzyer.tokenStream("foo", titles[i]),
          new SuggestUtils.TokenConsumer() {
            @Override
            public void nextToken() throws IOException {
              if (builder.length() == 0) {
                builder.append(this.charTermAttr.toString());
              }
            }
          });
      String firstTerm = builder.toString();
      String prefix =
          firstTerm.isEmpty() ? "" : firstTerm.substring(0, between(1, firstTerm.length()));
      List<LookupResult> refLookup = reference.lookup(prefix, false, res);
      List<LookupResult> lookup = buildAnalyzingLookup.lookup(prefix, false, res);
      assertThat(refLookup.toString(), lookup.size(), equalTo(refLookup.size()));
      for (int j = 0; j < refLookup.size(); j++) {
        assertThat(lookup.get(j).key, equalTo(refLookup.get(j).key));
        assertThat(
            "prefix: "
                + prefix
                + " "
                + j
                + " -- missmatch cost: "
                + lookup.get(j).key
                + " - "
                + lookup.get(j).value
                + " | "
                + refLookup.get(j).key
                + " - "
                + refLookup.get(j).value,
            lookup.get(j).value,
            equalTo(refLookup.get(j).value));
        assertThat(lookup.get(j).payload, equalTo(refLookup.get(j).payload));
        if (usePayloads) {
          assertThat(
              lookup.get(j).payload.utf8ToString(), equalTo(Long.toString(lookup.get(j).value)));
        }
      }
    }
  }
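
The test duels two suggester implementations by asserting that lookup(prefix, false, num) returns
the same keys, weights, and payloads from both. As a toy model of that lookup contract, here is a
TreeMap-based top-n prefix lookup with invented entries; the real suggesters back this with an
FST, so this is only a sketch of the semantics being compared.

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

public class PrefixLookupDemo {
  public static void main(String[] args) {
    TreeMap<String, Long> entries = new TreeMap<>();
    entries.put("lucene in action", 40L);
    entries.put("lucene suggesters", 90L);
    entries.put("lucky numbers", 10L);
    System.out.println(lookup(entries, "luc", 2)); // [lucene suggesters, lucene in action]
  }

  /** Returns the top-n completions starting with the prefix, ordered by weight. */
  static List<String> lookup(TreeMap<String, Long> entries, String prefix, int n) {
    // Every key sharing the prefix sorts into [prefix, prefix + '\uFFFF').
    Map<String, Long> range = entries.subMap(prefix, prefix + Character.MAX_VALUE);
    List<String> hits = new ArrayList<>(range.keySet());
    hits.sort(Comparator.comparingLong((String k) -> range.get(k)).reversed());
    return hits.subList(0, Math.min(n, hits.size()));
  }
}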