Exemple #1
0
  /**
   * Initializes this field type from the arguments of its schema declaration.
   *
   * <p>Starts from schema-version-dependent default properties, applies the explicit true/false
   * property arguments on top of them, passes the remaining arguments to {@code init}, and then
   * consumes the position increment gap, postings format and doc values format arguments. Any
   * argument still left afterwards is rejected.
   *
   * @param schema the schema whose version selects the default properties
   * @param args the declared arguments; for older schema versions {@code compressThreshold} is
   *     removed from this map in place
   * @throws RuntimeException if a position increment gap is given but the index or query analyzer
   *     is not a {@code SolrAnalyzer}, or if unconsumed arguments remain
   */
  protected void setArgs(IndexSchema schema, Map<String, String> args) {
    // default to STORED, INDEXED, OMIT_TF_POSITIONS and MULTIVALUED depending on schema version
    properties = STORED | INDEXED;
    final float version = schema.getVersion();
    if (version < 1.1f) {
      properties |= MULTIVALUED;
    }
    if (version > 1.1f) {
      properties |= OMIT_TF_POSITIONS;
    }
    // NOTE(review): 1.3 is a double literal, unlike the float literals used by the other checks,
    // so the float version is widened to double before comparing; a version of 1.3f widens to a
    // value just below 1.3 and therefore also satisfies this test. Confirm that is intended.
    if (version < 1.3) {
      args.remove("compressThreshold");
    }
    if (version >= 1.6f) {
      properties |= USE_DOCVALUES_AS_STORED;
    }

    this.args = Collections.unmodifiableMap(args);

    // Work on a copy so that consuming arguments does not disturb the map stored above.
    final Map<String, String> remaining = new HashMap<>(args);
    remaining.remove(CLASS_NAME); // consume the class arg

    trueProperties = FieldProperties.parseProperties(remaining, true, false);
    falseProperties = FieldProperties.parseProperties(remaining, false, false);

    // Explicit "false" settings clear defaults, explicit "true" settings are then switched on.
    properties = (properties & ~falseProperties) | trueProperties;

    for (String propertyName : FieldProperties.propertyNames) {
      remaining.remove(propertyName);
    }

    init(schema, remaining);

    final String gap = remaining.get(POSITION_INCREMENT_GAP);
    if (gap != null) {
      final Analyzer indexAnalyzer = getIndexAnalyzer();
      if (!(indexAnalyzer instanceof SolrAnalyzer)) {
        throw new RuntimeException(
            "Can't set " + POSITION_INCREMENT_GAP + " on custom analyzer " + indexAnalyzer.getClass());
      }
      ((SolrAnalyzer) indexAnalyzer).setPositionIncrementGap(Integer.parseInt(gap));

      final Analyzer queryAnalyzer = getQueryAnalyzer();
      if (!(queryAnalyzer instanceof SolrAnalyzer)) {
        throw new RuntimeException(
            "Can't set " + POSITION_INCREMENT_GAP + " on custom analyzer " + queryAnalyzer.getClass());
      }
      ((SolrAnalyzer) queryAnalyzer).setPositionIncrementGap(Integer.parseInt(gap));

      remaining.remove(POSITION_INCREMENT_GAP);
    }

    this.postingsFormat = remaining.remove(POSTINGS_FORMAT);
    this.docValuesFormat = remaining.remove(DOC_VALUES_FORMAT);

    // Everything recognised has been consumed by now; leftovers are configuration errors.
    if (!remaining.isEmpty()) {
      throw new RuntimeException(
          "schema fieldtype "
              + typeName
              + "("
              + this.getClass().getName()
              + ")"
              + " invalid arguments:"
              + remaining);
    }
  }
  /**
   * Begins the indexing.
   *
   * <p>Instantiates the document handler named by {@code handlerClassName} and the analyzer named
   * by {@code analyzerClassName}, configures the handler when it is a {@code
   * ConfigurableDocumentHandler}, and then runs {@code indexDocs()}.
   *
   * @throws BuildException if the handler or analyzer cannot be created, or if an I/O error occurs
   *     while indexing the fileset
   */
  @Override
  public void execute() throws BuildException {

    // construct handler and analyzer dynamically
    try {
      // Class.newInstance() is deprecated; instantiate through the no-arg constructor instead.
      handler =
          Class.forName(handlerClassName)
              .asSubclass(DocumentHandler.class)
              .getDeclaredConstructor()
              .newInstance();

      analyzer = IndexTask.createAnalyzer(analyzerClassName);
    } catch (java.lang.reflect.InvocationTargetException e) {
      // The handler's constructor itself failed: report the real cause rather than the
      // reflective wrapper, which is what Class.newInstance() exposed to callers.
      Throwable cause = e.getCause();
      if (cause instanceof Error) {
        throw (Error) cause;
      }
      throw new BuildException(cause);
    } catch (Exception e) {
      throw new BuildException(e);
    }

    log("Document handler = " + handler.getClass(), Project.MSG_VERBOSE);
    log("Analyzer = " + analyzer.getClass(), Project.MSG_VERBOSE);

    if (handler instanceof ConfigurableDocumentHandler) {
      ((ConfigurableDocumentHandler) handler).configure(handlerConfig.getProperties());
    }

    try {
      indexDocs();
    } catch (IOException e) {
      throw new BuildException(e);
    }
  }
 /**
  * Parses every entry of {@code rangeInput} with analyzer {@code a} and checks the result against
  * the entry of {@code rangeExpected} at the same index.
  */
 public void testRangeQuery() throws ParseException {
   int idx = 0;
   while (idx < rangeInput.length) {
     String message =
         "Testing ranges with analyzer " + a.getClass() + ", input string: " + rangeInput[idx];
     assertEquals(message, rangeExpected[idx], parseWithAnalyzingQueryParser(rangeInput[idx], a));
     idx++;
   }
 }
 /**
  * Parses every entry of {@code prefixInput} with analyzer {@code a} and checks the result against
  * the entry of {@code prefixExpected} at the same index.
  */
 public void testPrefixQuery() throws ParseException {
   int idx = 0;
   while (idx < prefixInput.length) {
     String message =
         "Testing prefixes with analyzer " + a.getClass() + ", input string: " + prefixInput[idx];
     assertEquals(message, prefixExpected[idx], parseWithAnalyzingQueryParser(prefixInput[idx], a));
     idx++;
   }
 }
 /**
  * Parses every entry of {@code wildcardInput} with analyzer {@code a} and checks the result
  * against the entry of {@code wildcardExpected} at the same index.
  */
 public void testWildCardQuery() throws ParseException {
   int idx = 0;
   while (idx < wildcardInput.length) {
     String message =
         "Testing wildcards with analyzer " + a.getClass() + ", input string: " + wildcardInput[idx];
     assertEquals(
         message, wildcardExpected[idx], parseWithAnalyzingQueryParser(wildcardInput[idx], a));
     idx++;
   }
 }
 /**
  * Parses every entry of {@code fuzzyInput} with analyzer {@code a} and checks the result against
  * the entry of {@code fuzzyExpected} at the same index.
  */
 public void testFuzzyQuery() throws ParseException {
   int idx = 0;
   while (idx < fuzzyInput.length) {
     String message =
         "Testing fuzzys with analyzer " + a.getClass() + ", input string: " + fuzzyInput[idx];
     assertEquals(message, fuzzyExpected[idx], parseWithAnalyzingQueryParser(fuzzyInput[idx], a));
     idx++;
   }
 }
Exemple #7
0
  /**
   * Tokenizes {@code input} with {@code analyzer}, groups the collected token offsets with {@code
   * groupTokenInfos}, and returns the substrings of {@code input} covered by the groups.
   *
   * <p>Groups are emitted in the order they are popped from the stack returned by {@code
   * groupTokenInfos}. When {@code Resources.debug} is set, every token and position is logged.
   *
   * @param analyzer the analyzer that produces the token stream
   * @param input the text to tokenize
   * @return the substrings of {@code input} selected by the grouped offsets
   * @throws IOException if the token stream fails
   */
  private static String[] groupTokens(Analyzer analyzer, String input) throws IOException {
    if (Resources.debug) {
      Resources.LOGGER.debug("TokenParser:" + input);
      Resources.LOGGER.debug("Analyzer:" + analyzer.getClass());
    }

    List<TermInfo> infos = new ArrayList<TermInfo>();
    TokenStream tokenStream = analyzer.tokenStream("input", input);
    // Close the stream even when reset()/incrementToken() throws, so it is not leaked.
    try {
      OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
      PositionIncrementAttribute positionIncrementAttribute =
          tokenStream.addAttribute(PositionIncrementAttribute.class);
      CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
      TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);
      tokenStream.reset();

      // Running token position; only used for debug output.
      int position = 0;
      while (tokenStream.incrementToken()) {
        int increment = positionIncrementAttribute.getPositionIncrement();
        if (increment > 0) {
          position = position + increment;
          if (Resources.debug) {
            Resources.LOGGER.debug(position + ":");
          }
        }

        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        TermInfo info = new TermInfo();
        info.setStart(startOffset);
        info.setEnd(endOffset);
        infos.add(info);

        if (Resources.debug) {
          // The term text is needed for logging only, so build it only when debugging.
          Resources.LOGGER.debug(
              "["
                  + charTermAttribute.toString()
                  + "]"
                  + ":("
                  + startOffset
                  + "-->"
                  + endOffset
                  + "):"
                  + typeAttribute.type());
        }
      }
      tokenStream.end();
    } finally {
      tokenStream.close();
    }

    Stack<TermInfo> tiStack = groupTokenInfos(infos);
    List<String> terms = new ArrayList<String>();
    while (!tiStack.isEmpty()) {
      TermInfo termInfo = tiStack.pop();
      // NOTE(review): a group starting at offset 0 is skipped by the ">= 1" check below;
      // confirm that this is intended.
      if (termInfo.getEnd() <= input.length() && termInfo.getStart() >= 1) {
        terms.add(input.substring(termInfo.getStart(), termInfo.getEnd()));
      }
    }
    return terms.toArray(new String[0]);
  }
 /**
  * Creates an analyzer for the locale held by the run data and installs it as the run's analyzer,
  * printing the chosen analyzer class and locale to standard output.
  *
  * @return always 1
  * @throws RuntimeException wrapping any failure raised while doing so, including the case where
  *     no locale has been set
  */
 @Override
 public int doLogic() throws Exception {
   try {
     final Locale current = getRunData().getLocale();
     if (current == null) {
       throw new RuntimeException("Locale must be set with the NewLocale task!");
     }

     final Analyzer created = createAnalyzer(current, impl);
     getRunData().setAnalyzer(created);

     final String description = created.getClass().getName() + "(" + current + ")";
     System.out.println("Changed Analyzer to: " + description);
   } catch (Exception e) {
     // Note that this also wraps the "Locale must be set" failure raised above.
     throw new RuntimeException("Error creating Analyzer: impl=" + impl, e);
   }
   return 1;
 }
Exemple #9
0
  /**
   * Tokenizes {@code input} with {@code analyzer} and returns the term text of every token, in
   * stream order.
   *
   * <p>When {@code Resources.debug} is set, every token is logged with its position, offsets and
   * type.
   *
   * @param analyzer the analyzer that produces the token stream
   * @param input the text to tokenize
   * @return the term texts produced by the analyzer
   * @throws IOException if the token stream fails
   */
  private static String[] mmsegTokens(Analyzer analyzer, String input) throws IOException {
    if (Resources.debug) {
      Resources.LOGGER.debug("TokenParser:" + input);
      Resources.LOGGER.debug("Analyzer:" + analyzer.getClass());
    }

    List<String> tokens = new ArrayList<String>();
    TokenStream tokenStream = analyzer.tokenStream("input", input);
    // Close the stream even when reset()/incrementToken() throws, so it is not leaked.
    try {
      OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
      PositionIncrementAttribute positionIncrementAttribute =
          tokenStream.addAttribute(PositionIncrementAttribute.class);
      CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
      TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);
      tokenStream.reset();

      // Running token position; only used for debug output.
      int position = 0;
      while (tokenStream.incrementToken()) {
        int increment = positionIncrementAttribute.getPositionIncrement();
        if (increment > 0) {
          position = position + increment;
          if (Resources.debug) {
            Resources.LOGGER.debug(position + ":");
          }
        }

        String term = charTermAttribute.toString();
        tokens.add(term);

        if (Resources.debug) {
          // Offsets and type are needed for logging only.
          Resources.LOGGER.debug(
              "["
                  + term
                  + "]"
                  + ":("
                  + offsetAttribute.startOffset()
                  + "-->"
                  + offsetAttribute.endOffset()
                  + "):"
                  + typeAttribute.type());
        }
      }
      tokenStream.end();
    } finally {
      tokenStream.close();
    }
    return tokens.toArray(new String[0]);
  }
  /**
   * Reads the index update settings from the job and instantiates the document analyzer, the
   * local analysis and the distribution policy reflectively, then logs the chosen
   * implementations and the index shards.
   *
   * @param job the job configuration to read from
   * @see org.apache.hadoop.mapred.MapReduceBase#configure(org.apache.hadoop.mapred.JobConf)
   */
  public void configure(JobConf job) {
    iconf = new IndexUpdateConfiguration(job);

    // Document analyzer.
    Class<?> analyzerClass = iconf.getDocumentAnalyzerClass();
    analyzer = (Analyzer) ReflectionUtils.newInstance(analyzerClass, job);

    // Local analysis, which additionally gets the job configuration passed in.
    Class<?> localAnalysisClass = iconf.getLocalAnalysisClass();
    localAnalysis = (ILocalAnalysis) ReflectionUtils.newInstance(localAnalysisClass, job);
    localAnalysis.configure(job);

    shards = Shard.getIndexShards(iconf);

    // Distribution policy, initialised with the shards resolved above.
    Class<?> policyClass = iconf.getDistributionPolicyClass();
    distributionPolicy = (IDistributionPolicy) ReflectionUtils.newInstance(policyClass, job);
    distributionPolicy.init(shards);

    String analyzerName = analyzer.getClass().getName();
    String localAnalysisName = localAnalysis.getClass().getName();
    String policyName = distributionPolicy.getClass().getName();
    LOG.info("sea.document.analyzer = " + analyzerName);
    LOG.info("sea.local.analysis = " + localAnalysisName);
    LOG.info(shards.length + " shards = " + iconf.getIndexShards());
    LOG.info("sea.distribution.policy = " + policyName);
  }
  /**
   * Describes an analyzer as an ordered map.
   *
   * <p>The map always holds the analyzer's class name under {@code "className"}. For a {@code
   * TokenizerChain} it additionally holds the char filters (only when there are any), the
   * tokenizer, and the token filters (only when there are any), each with its class name and
   * args; filters are keyed by the part of their class name after the last dot.
   *
   * @param analyzer the analyzer to describe
   * @return the description of {@code analyzer}
   */
  private static SimpleOrderedMap<Object> getAnalyzerInfo(Analyzer analyzer) {
    SimpleOrderedMap<Object> info = new SimpleOrderedMap<Object>();
    info.add("className", analyzer.getClass().getName());

    // Anything that is not a tokenizer chain is described by its class name alone.
    if (!(analyzer instanceof TokenizerChain)) {
      return info;
    }
    TokenizerChain chain = (TokenizerChain) analyzer;

    // Char filters.
    SimpleOrderedMap<Map<String, Object>> charFilters =
        new SimpleOrderedMap<Map<String, Object>>();
    for (CharFilterFactory factory : chain.getCharFilterFactories()) {
      String qualifiedName = factory.getClass().getName();
      Map<String, Object> entry = new HashMap<String, Object>();
      entry.put("className", qualifiedName);
      entry.put("args", factory.getArgs());
      int lastDot = qualifiedName.lastIndexOf('.');
      charFilters.add(qualifiedName.substring(lastDot + 1), entry);
    }
    if (charFilters.size() > 0) {
      info.add("charFilters", charFilters);
    }

    // Tokenizer.
    TokenizerFactory tokenizerFactory = chain.getTokenizerFactory();
    SimpleOrderedMap<Object> tokenizerInfo = new SimpleOrderedMap<Object>();
    tokenizerInfo.add("className", tokenizerFactory.getClass().getName());
    tokenizerInfo.add("args", tokenizerFactory.getArgs());
    info.add("tokenizer", tokenizerInfo);

    // Token filters.
    SimpleOrderedMap<Map<String, Object>> tokenFilters =
        new SimpleOrderedMap<Map<String, Object>>();
    for (TokenFilterFactory factory : chain.getTokenFilterFactories()) {
      String qualifiedName = factory.getClass().getName();
      Map<String, Object> entry = new HashMap<String, Object>();
      entry.put("className", qualifiedName);
      entry.put("args", factory.getArgs());
      int lastDot = qualifiedName.lastIndexOf('.');
      tokenFilters.add(qualifiedName.substring(lastDot + 1), entry);
    }
    if (tokenFilters.size() > 0) {
      info.add("filters", tokenFilters);
    }

    return info;
  }
  /**
   * Tokenizes {@code text} with a {@code RussianAnalyzer}, joins the resulting terms with {@code
   * " AND "} into a query string, and passes that query to {@code startSearch}.
   *
   * <p>The proposal list is cleared first. The analyzed text, the analyzer class and every term
   * are echoed to standard output, and the final query is logged.
   *
   * @param text the text to analyze
   * @throws IOException if the token stream fails
   */
  public void analyze(String text) throws IOException {
    List<String> searchlst = new ArrayList<String>();

    proposalController.getProposalList().clear();
    System.out.println("Analzying \"" + text + "\"");

    Analyzer analyzer = new RussianAnalyzer(Version.LUCENE_31);
    System.out.println("\t" + analyzer.getClass().getName() + ":");
    System.out.print("\t\t");

    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    // Follow the TokenStream consumer workflow (reset, incrementToken*, end, close) and make
    // sure the stream is closed even if consuming it fails.
    try {
      // The attribute instance is updated in place on every incrementToken(), so it is
      // fetched once instead of cloning all attributes for each token.
      CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        String token = term.toString();
        System.out.print("[" + token + "] ");
        searchlst.add(token);
      }
      stream.end();
    } finally {
      stream.close();
    }

    // Join the terms with " AND " between them.
    StringBuilder query = new StringBuilder();
    for (int i = 0; i < searchlst.size(); i++) {
      if (i > 0) {
        query.append(" AND ");
      }
      query.append(searchlst.get(i));
    }

    _log.info("Запрос для поиска:" + query);
    startSearch(query.toString());
    System.out.println("\n");
  }
Exemple #13
0
  /**
   * Returns a description of the given analyzer.
   *
   * <p>For an analyzer that is not a {@code TokenizerChain} the description is its class name,
   * plus its luceneMatchVersion when that differs from {@code Version.LATEST}. For a {@code
   * TokenizerChain} each char filter, tokenizer and token filter factory is described by its class
   * argument and its original args; the class argument is not repeated among the args, and the
   * luceneMatchVersion arg is included only when the factory reports it as explicit.
   *
   * @param analyzer the analyzer to describe
   * @return the ordered properties describing {@code analyzer}
   */
  protected static SimpleOrderedMap<Object> getAnalyzerProperties(Analyzer analyzer) {
    SimpleOrderedMap<Object> result = new SimpleOrderedMap<>();

    // Plain analyzers: class name and, if not the latest, the match version.
    if (!(analyzer instanceof TokenizerChain)) {
      result.add(CLASS_NAME, analyzer.getClass().getName());
      if (analyzer.getVersion() != Version.LATEST) {
        result.add(LUCENE_MATCH_VERSION_PARAM, analyzer.getVersion().toString());
      }
      return result;
    }

    TokenizerChain chain = (TokenizerChain) analyzer;

    // Char filters (section omitted entirely when there are none).
    CharFilterFactory[] charFilters = chain.getCharFilterFactories();
    if (charFilters.length > 0) {
      List<SimpleOrderedMap<Object>> described = new ArrayList<>();
      for (CharFilterFactory factory : charFilters) {
        SimpleOrderedMap<Object> entry = new SimpleOrderedMap<>();
        entry.add(CLASS_NAME, factory.getClassArg());
        Map<String, String> originalArgs = factory.getOriginalArgs();
        if (originalArgs != null) {
          for (Map.Entry<String, String> arg : originalArgs.entrySet()) {
            String key = arg.getKey();
            if (CLASS_NAME.equals(key)) {
              continue; // already reported through getClassArg()
            }
            if (LUCENE_MATCH_VERSION_PARAM.equals(key)
                && !factory.isExplicitLuceneMatchVersion()) {
              continue; // match version was not set explicitly
            }
            entry.add(key, arg.getValue());
          }
        }
        described.add(entry);
      }
      result.add(CHAR_FILTERS, described);
    }

    // Tokenizer (always present).
    TokenizerFactory tokenizer = chain.getTokenizerFactory();
    SimpleOrderedMap<Object> tokenizerEntry = new SimpleOrderedMap<>();
    tokenizerEntry.add(CLASS_NAME, tokenizer.getClassArg());
    Map<String, String> tokenizerArgs = tokenizer.getOriginalArgs();
    if (tokenizerArgs != null) {
      for (Map.Entry<String, String> arg : tokenizerArgs.entrySet()) {
        String key = arg.getKey();
        if (CLASS_NAME.equals(key)) {
          continue; // already reported through getClassArg()
        }
        if (LUCENE_MATCH_VERSION_PARAM.equals(key) && !tokenizer.isExplicitLuceneMatchVersion()) {
          continue; // match version was not set explicitly
        }
        tokenizerEntry.add(key, arg.getValue());
      }
    }
    result.add(TOKENIZER, tokenizerEntry);

    // Token filters (section omitted entirely when there are none).
    TokenFilterFactory[] tokenFilters = chain.getTokenFilterFactories();
    if (tokenFilters.length > 0) {
      List<SimpleOrderedMap<Object>> described = new ArrayList<>();
      for (TokenFilterFactory factory : tokenFilters) {
        SimpleOrderedMap<Object> entry = new SimpleOrderedMap<>();
        entry.add(CLASS_NAME, factory.getClassArg());
        Map<String, String> originalArgs = factory.getOriginalArgs();
        if (originalArgs != null) {
          for (Map.Entry<String, String> arg : originalArgs.entrySet()) {
            String key = arg.getKey();
            if (CLASS_NAME.equals(key)) {
              continue; // already reported through getClassArg()
            }
            if (LUCENE_MATCH_VERSION_PARAM.equals(key)
                && !factory.isExplicitLuceneMatchVersion()) {
              continue; // match version was not set explicitly
            }
            entry.add(key, arg.getValue());
          }
        }
        described.add(entry);
      }
      result.add(FILTERS, described);
    }

    return result;
  }