// Handle additional arguments...
protected void setArgs(IndexSchema schema, Map<String, String> args) {
  // default to STORED, INDEXED, OMIT_TF_POSITIONS and MULTIVALUED depending on schema version
  properties = (STORED | INDEXED);
  float schemaVersion = schema.getVersion();
  if (schemaVersion < 1.1f) properties |= MULTIVALUED;
  if (schemaVersion > 1.1f) properties |= OMIT_TF_POSITIONS;
  if (schemaVersion < 1.3) {
    args.remove("compressThreshold");
  }
  if (schemaVersion >= 1.6f) properties |= USE_DOCVALUES_AS_STORED;

  this.args = Collections.unmodifiableMap(args);
  Map<String, String> initArgs = new HashMap<>(args);
  initArgs.remove(CLASS_NAME); // consume the class arg

  trueProperties = FieldProperties.parseProperties(initArgs, true, false);
  falseProperties = FieldProperties.parseProperties(initArgs, false, false);

  properties &= ~falseProperties;
  properties |= trueProperties;

  for (String prop : FieldProperties.propertyNames) initArgs.remove(prop);

  init(schema, initArgs);

  String positionInc = initArgs.get(POSITION_INCREMENT_GAP);
  if (positionInc != null) {
    Analyzer analyzer = getIndexAnalyzer();
    if (analyzer instanceof SolrAnalyzer) {
      ((SolrAnalyzer) analyzer).setPositionIncrementGap(Integer.parseInt(positionInc));
    } else {
      throw new RuntimeException(
          "Can't set " + POSITION_INCREMENT_GAP + " on custom analyzer " + analyzer.getClass());
    }
    analyzer = getQueryAnalyzer();
    if (analyzer instanceof SolrAnalyzer) {
      ((SolrAnalyzer) analyzer).setPositionIncrementGap(Integer.parseInt(positionInc));
    } else {
      throw new RuntimeException(
          "Can't set " + POSITION_INCREMENT_GAP + " on custom analyzer " + analyzer.getClass());
    }
    initArgs.remove(POSITION_INCREMENT_GAP);
  }

  this.postingsFormat = initArgs.remove(POSTINGS_FORMAT);
  this.docValuesFormat = initArgs.remove(DOC_VALUES_FORMAT);

  if (initArgs.size() > 0) {
    throw new RuntimeException(
        "schema fieldtype " + typeName + "(" + this.getClass().getName() + ")"
            + " invalid arguments:" + initArgs);
  }
}
/**
 * Begins the indexing.
 *
 * @exception BuildException If an error occurs indexing the fileset
 */
@Override
public void execute() throws BuildException {
  // construct handler and analyzer dynamically
  try {
    handler = Class.forName(handlerClassName).asSubclass(DocumentHandler.class).newInstance();
    analyzer = IndexTask.createAnalyzer(analyzerClassName);
  } catch (Exception e) {
    throw new BuildException(e);
  }

  log("Document handler = " + handler.getClass(), Project.MSG_VERBOSE);
  log("Analyzer = " + analyzer.getClass(), Project.MSG_VERBOSE);

  if (handler instanceof ConfigurableDocumentHandler) {
    ((ConfigurableDocumentHandler) handler).configure(handlerConfig.getProperties());
  }

  try {
    indexDocs();
  } catch (IOException e) {
    throw new BuildException(e);
  }
}
public void testRangeQuery() throws ParseException {
  for (int i = 0; i < rangeInput.length; i++) {
    assertEquals(
        "Testing ranges with analyzer " + a.getClass() + ", input string: " + rangeInput[i],
        rangeExpected[i],
        parseWithAnalyzingQueryParser(rangeInput[i], a));
  }
}
public void testPrefixQuery() throws ParseException {
  for (int i = 0; i < prefixInput.length; i++) {
    assertEquals(
        "Testing prefixes with analyzer " + a.getClass() + ", input string: " + prefixInput[i],
        prefixExpected[i],
        parseWithAnalyzingQueryParser(prefixInput[i], a));
  }
}
public void testWildCardQuery() throws ParseException {
  for (int i = 0; i < wildcardInput.length; i++) {
    assertEquals(
        "Testing wildcards with analyzer " + a.getClass() + ", input string: " + wildcardInput[i],
        wildcardExpected[i],
        parseWithAnalyzingQueryParser(wildcardInput[i], a));
  }
}
public void testFuzzyQuery() throws ParseException {
  for (int i = 0; i < fuzzyInput.length; i++) {
    assertEquals(
        "Testing fuzzy queries with analyzer " + a.getClass() + ", input string: " + fuzzyInput[i],
        fuzzyExpected[i],
        parseWithAnalyzingQueryParser(fuzzyInput[i], a));
  }
}
private static String[] groupTokens(Analyzer analyzer, String input) throws IOException {
  if (Resources.debug) {
    Resources.LOGGER.debug("TokenParser:" + input);
    Resources.LOGGER.debug("Analyzer:" + analyzer.getClass());
  }

  TokenStream tokenStream = analyzer.tokenStream("input", input);
  OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
  PositionIncrementAttribute positionIncrementAttribute =
      tokenStream.addAttribute(PositionIncrementAttribute.class);
  CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
  TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);
  tokenStream.reset();

  int position = 0;
  List<TermInfo> infos = new ArrayList<TermInfo>();
  while (tokenStream.incrementToken()) {
    int increment = positionIncrementAttribute.getPositionIncrement();
    if (increment > 0) {
      position = position + increment;
      if (Resources.debug) {
        Resources.LOGGER.debug(position + ":");
      }
    }

    int startOffset = offsetAttribute.startOffset();
    int endOffset = offsetAttribute.endOffset();
    String term = charTermAttribute.toString();

    TermInfo info = new TermInfo();
    info.setStart(startOffset);
    info.setEnd(endOffset);
    infos.add(info);

    if (Resources.debug) {
      Resources.LOGGER.debug(
          "[" + term + "]" + ":(" + startOffset + "-->" + endOffset + "):" + typeAttribute.type());
    }
  }
  tokenStream.end();
  tokenStream.close();

  Stack<TermInfo> tiStack = groupTokenInfos(infos);
  List<String> terms = new ArrayList<String>();
  while (!tiStack.isEmpty()) {
    TermInfo termInfo = tiStack.pop();
    if (termInfo.getEnd() <= input.length() && termInfo.getStart() >= 1) {
      String term = input.substring(termInfo.getStart(), termInfo.getEnd());
      terms.add(term);
    }
  }
  return terms.toArray(new String[] {});
}
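// A minimal usage sketch for groupTokens above, not from the original source: it assumes it
// lives in the same class (the helper is private), that org.apache.lucene.analysis.standard
// .StandardAnalyzer is on the classpath with its no-arg constructor (Lucene 5.0+), and the
// input string is hypothetical; any Analyzer could stand in.
public static void main(String[] args) throws IOException {
  Analyzer analyzer = new StandardAnalyzer();
  String[] grouped = groupTokens(analyzer, "an example input string");
  for (String term : grouped) {
    System.out.println(term); // each returned term is a substring of the original input
  }
  analyzer.close();
}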
@Override
public int doLogic() throws Exception {
  try {
    Locale locale = getRunData().getLocale();
    if (locale == null) throw new RuntimeException("Locale must be set with the NewLocale task!");
    Analyzer analyzer = createAnalyzer(locale, impl);
    getRunData().setAnalyzer(analyzer);
    System.out.println(
        "Changed Analyzer to: " + analyzer.getClass().getName() + "(" + locale + ")");
  } catch (Exception e) {
    throw new RuntimeException("Error creating Analyzer: impl=" + impl, e);
  }
  return 1;
}
private static String[] mmsegTokens(Analyzer analyzer, String input) throws IOException {
  if (Resources.debug) {
    Resources.LOGGER.debug("TokenParser:" + input);
    Resources.LOGGER.debug("Analyzer:" + analyzer.getClass());
  }

  TokenStream tokenStream = analyzer.tokenStream("input", input);
  OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
  PositionIncrementAttribute positionIncrementAttribute =
      tokenStream.addAttribute(PositionIncrementAttribute.class);
  CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
  TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);
  tokenStream.reset();

  int position = 0;
  List<String> tokens = new ArrayList<String>();
  while (tokenStream.incrementToken()) {
    int increment = positionIncrementAttribute.getPositionIncrement();
    if (increment > 0) {
      position = position + increment;
      if (Resources.debug) {
        Resources.LOGGER.debug(position + ":");
      }
    }

    int startOffset = offsetAttribute.startOffset();
    int endOffset = offsetAttribute.endOffset();
    String term = charTermAttribute.toString();
    tokens.add(term);

    if (Resources.debug) {
      Resources.LOGGER.debug(
          "[" + term + "]" + ":(" + startOffset + "-->" + endOffset + "):" + typeAttribute.type());
    }
  }
  tokenStream.end();
  tokenStream.close();

  return tokens.toArray(new String[] {});
}
/* (non-Javadoc)
 * @see org.apache.hadoop.mapred.MapReduceBase#configure(org.apache.hadoop.mapred.JobConf)
 */
public void configure(JobConf job) {
  iconf = new IndexUpdateConfiguration(job);

  analyzer = (Analyzer) ReflectionUtils.newInstance(iconf.getDocumentAnalyzerClass(), job);

  localAnalysis = (ILocalAnalysis) ReflectionUtils.newInstance(iconf.getLocalAnalysisClass(), job);
  localAnalysis.configure(job);

  shards = Shard.getIndexShards(iconf);

  distributionPolicy =
      (IDistributionPolicy) ReflectionUtils.newInstance(iconf.getDistributionPolicyClass(), job);
  distributionPolicy.init(shards);

  LOG.info("sea.document.analyzer = " + analyzer.getClass().getName());
  LOG.info("sea.local.analysis = " + localAnalysis.getClass().getName());
  LOG.info(shards.length + " shards = " + iconf.getIndexShards());
  LOG.info("sea.distribution.policy = " + distributionPolicy.getClass().getName());
}
private static SimpleOrderedMap<Object> getAnalyzerInfo(Analyzer analyzer) {
  SimpleOrderedMap<Object> aninfo = new SimpleOrderedMap<Object>();
  aninfo.add("className", analyzer.getClass().getName());

  if (analyzer instanceof TokenizerChain) {
    TokenizerChain tchain = (TokenizerChain) analyzer;

    CharFilterFactory[] cfiltfacs = tchain.getCharFilterFactories();
    SimpleOrderedMap<Map<String, Object>> cfilters = new SimpleOrderedMap<Map<String, Object>>();
    for (CharFilterFactory cfiltfac : cfiltfacs) {
      Map<String, Object> tok = new HashMap<String, Object>();
      String className = cfiltfac.getClass().getName();
      tok.put("className", className);
      tok.put("args", cfiltfac.getArgs());
      cfilters.add(className.substring(className.lastIndexOf('.') + 1), tok);
    }
    if (cfilters.size() > 0) {
      aninfo.add("charFilters", cfilters);
    }

    SimpleOrderedMap<Object> tokenizer = new SimpleOrderedMap<Object>();
    TokenizerFactory tfac = tchain.getTokenizerFactory();
    tokenizer.add("className", tfac.getClass().getName());
    tokenizer.add("args", tfac.getArgs());
    aninfo.add("tokenizer", tokenizer);

    TokenFilterFactory[] filtfacs = tchain.getTokenFilterFactories();
    SimpleOrderedMap<Map<String, Object>> filters = new SimpleOrderedMap<Map<String, Object>>();
    for (TokenFilterFactory filtfac : filtfacs) {
      Map<String, Object> tok = new HashMap<String, Object>();
      String className = filtfac.getClass().getName();
      tok.put("className", className);
      tok.put("args", filtfac.getArgs());
      filters.add(className.substring(className.lastIndexOf('.') + 1), tok);
    }
    if (filters.size() > 0) {
      aninfo.add("filters", filters);
    }
  }
  return aninfo;
}
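// A minimal usage sketch for getAnalyzerInfo above, not from the original source: it assumes it
// runs inside the same class (the method is private) and that a Lucene version with a no-arg
// org.apache.lucene.analysis.core.WhitespaceAnalyzer constructor is in use (5.0+). For a plain
// Analyzer only "className" is populated; a Solr TokenizerChain additionally yields
// "charFilters", "tokenizer", and "filters" entries.
static void printAnalyzerInfo() {
  SimpleOrderedMap<Object> info = getAnalyzerInfo(new WhitespaceAnalyzer());
  System.out.println(info.get("className")); // e.g. org.apache.lucene.analysis.core.WhitespaceAnalyzer
}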
public void analyze(String text) throws IOException {
  List<String> searchlst = new ArrayList<String>();
  proposalController.getProposalList().clear();
  String query = "";
  System.out.println("Analyzing \"" + text + "\"");

  Analyzer analyzer = new RussianAnalyzer(Version.LUCENE_31);
  System.out.println("\t" + analyzer.getClass().getName() + ":");
  System.out.print("\t\t");

  TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
  stream.reset(); // the TokenStream contract requires reset() before incrementToken()
  while (stream.incrementToken()) {
    AttributeSource token = stream.cloneAttributes();
    CharTermAttribute term = (CharTermAttribute) token.addAttribute(CharTermAttribute.class);
    System.out.print("[" + term.toString() + "] ");
    searchlst.add(term.toString());
  }
  stream.end();
  stream.close();

  int i = 0;
  for (String param : searchlst) {
    if (i < searchlst.size() - 1) {
      query += param + " AND ";
    } else {
      query += param;
    }
    i++;
  }
  _log.info("Search query: " + query);
  startSearch(query);
  System.out.println("\n");
}
/**
 * Returns a description of the given analyzer, by either reporting the Analyzer class name (and
 * optionally luceneMatchVersion) if it's not a TokenizerChain, or if it is, querying each
 * analysis factory for its name and args.
 */
protected static SimpleOrderedMap<Object> getAnalyzerProperties(Analyzer analyzer) {
  SimpleOrderedMap<Object> analyzerProps = new SimpleOrderedMap<>();

  if (analyzer instanceof TokenizerChain) {
    Map<String, String> factoryArgs;
    TokenizerChain tokenizerChain = (TokenizerChain) analyzer;

    CharFilterFactory[] charFilterFactories = tokenizerChain.getCharFilterFactories();
    if (0 < charFilterFactories.length) {
      List<SimpleOrderedMap<Object>> charFilterProps = new ArrayList<>();
      for (CharFilterFactory charFilterFactory : charFilterFactories) {
        SimpleOrderedMap<Object> props = new SimpleOrderedMap<>();
        props.add(CLASS_NAME, charFilterFactory.getClassArg());
        factoryArgs = charFilterFactory.getOriginalArgs();
        if (null != factoryArgs) {
          for (String key : factoryArgs.keySet()) {
            if (!CLASS_NAME.equals(key)) {
              if (LUCENE_MATCH_VERSION_PARAM.equals(key)) {
                if (charFilterFactory.isExplicitLuceneMatchVersion()) {
                  props.add(key, factoryArgs.get(key));
                }
              } else {
                props.add(key, factoryArgs.get(key));
              }
            }
          }
        }
        charFilterProps.add(props);
      }
      analyzerProps.add(CHAR_FILTERS, charFilterProps);
    }

    SimpleOrderedMap<Object> tokenizerProps = new SimpleOrderedMap<>();
    TokenizerFactory tokenizerFactory = tokenizerChain.getTokenizerFactory();
    tokenizerProps.add(CLASS_NAME, tokenizerFactory.getClassArg());
    factoryArgs = tokenizerFactory.getOriginalArgs();
    if (null != factoryArgs) {
      for (String key : factoryArgs.keySet()) {
        if (!CLASS_NAME.equals(key)) {
          if (LUCENE_MATCH_VERSION_PARAM.equals(key)) {
            if (tokenizerFactory.isExplicitLuceneMatchVersion()) {
              tokenizerProps.add(key, factoryArgs.get(key));
            }
          } else {
            tokenizerProps.add(key, factoryArgs.get(key));
          }
        }
      }
    }
    analyzerProps.add(TOKENIZER, tokenizerProps);

    TokenFilterFactory[] filterFactories = tokenizerChain.getTokenFilterFactories();
    if (0 < filterFactories.length) {
      List<SimpleOrderedMap<Object>> filterProps = new ArrayList<>();
      for (TokenFilterFactory filterFactory : filterFactories) {
        SimpleOrderedMap<Object> props = new SimpleOrderedMap<>();
        props.add(CLASS_NAME, filterFactory.getClassArg());
        factoryArgs = filterFactory.getOriginalArgs();
        if (null != factoryArgs) {
          for (String key : factoryArgs.keySet()) {
            if (!CLASS_NAME.equals(key)) {
              if (LUCENE_MATCH_VERSION_PARAM.equals(key)) {
                if (filterFactory.isExplicitLuceneMatchVersion()) {
                  props.add(key, factoryArgs.get(key));
                }
              } else {
                props.add(key, factoryArgs.get(key));
              }
            }
          }
        }
        filterProps.add(props);
      }
      analyzerProps.add(FILTERS, filterProps);
    }
  } else { // analyzer is not instanceof TokenizerChain
    analyzerProps.add(CLASS_NAME, analyzer.getClass().getName());
    if (analyzer.getVersion() != Version.LATEST) {
      analyzerProps.add(LUCENE_MATCH_VERSION_PARAM, analyzer.getVersion().toString());
    }
  }
  return analyzerProps;
}