@Override public void setConfig(Config config) { super.setConfig(config); keepImages = config.get("keep.image.only.docs", true); String fileName = config.get("docs.file", null); if (fileName == null) { throw new IllegalArgumentException("docs.file must be set"); } file = new File(fileName).getAbsoluteFile(); }
@Override public void setConfig(Config config) { super.setConfig(config); File workDir = new File(config.get("work.dir", "work")); String d = config.get("docs.dir", "dir-out"); dataDir = new File(d); if (!dataDir.isAbsolute()) { dataDir = new File(workDir, d); } inputFiles = new Iterator(dataDir); if (inputFiles == null) { throw new RuntimeException("No txt files in dataDir: " + dataDir.getAbsolutePath()); } }
@Override public void setConfig(Config config) { super.setConfig(config); // dirs File workDir = new File(config.get("work.dir", "work")); String d = config.get("docs.dir", "trec"); dataDir = new File(d); if (!dataDir.isAbsolute()) { dataDir = new File(workDir, d); } // files collectFiles(dataDir, inputFiles); if (inputFiles.size() == 0) { throw new IllegalArgumentException("No files in dataDir: " + dataDir); } // trec doc parser try { String trecDocParserClassName = config.get("trec.doc.parser", "org.apache.lucene.benchmark.byTask.feeds.TrecGov2Parser"); trecDocParser = Class.forName(trecDocParserClassName).asSubclass(TrecDocParser.class).newInstance(); } catch (Exception e) { // Should not get here. Throw runtime exception. throw new RuntimeException(e); } // html parser try { String htmlParserClassName = config.get("html.parser", "org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser"); htmlParser = Class.forName(htmlParserClassName).asSubclass(HTMLParser.class).newInstance(); } catch (Exception e) { // Should not get here. Throw runtime exception. throw new RuntimeException(e); } // encoding if (encoding == null) { encoding = "ISO-8859-1"; } // iteration exclusion in doc name excludeDocnameIteration = config.get("content.source.excludeIteration", false); }
@Override public void setConfig(Config config) { super.setConfig(config); String fileName = config.get("docs.file", null); if (fileName == null) { throw new IllegalArgumentException("docs.file must be set"); } file = Paths.get(fileName).toAbsolutePath(); if (encoding == null) { encoding = IOUtils.UTF_8; } }
/** Set the configuration parameters of this doc maker. */ public void setConfig(Config config) { this.config = config; try { String sourceClass = config.get("content.source", "org.apache.lucene.benchmark.byTask.feeds.SingleDocSource"); source = Class.forName(sourceClass).asSubclass(ContentSource.class).newInstance(); source.setConfig(config); } catch (Exception e) { // Should not get here. Throw runtime exception. throw new RuntimeException(e); } boolean stored = config.get("doc.stored", false); boolean bodyStored = config.get("doc.body.stored", stored); boolean tokenized = config.get("doc.tokenized", true); boolean bodyTokenized = config.get("doc.body.tokenized", tokenized); boolean norms = config.get("doc.tokenized.norms", false); boolean bodyNorms = config.get("doc.body.tokenized.norms", true); boolean termVec = config.get("doc.term.vector", false); storeVal = (stored ? Field.Store.YES : Field.Store.NO); bodyStoreVal = (bodyStored ? Field.Store.YES : Field.Store.NO); if (tokenized) { indexVal = norms ? Index.ANALYZED : Index.ANALYZED_NO_NORMS; } else { indexVal = norms ? Index.NOT_ANALYZED : Index.NOT_ANALYZED_NO_NORMS; } if (bodyTokenized) { bodyIndexVal = bodyNorms ? Index.ANALYZED : Index.ANALYZED_NO_NORMS; } else { bodyIndexVal = bodyNorms ? Index.NOT_ANALYZED : Index.NOT_ANALYZED_NO_NORMS; } boolean termVecPositions = config.get("doc.term.vector.positions", false); boolean termVecOffsets = config.get("doc.term.vector.offsets", false); if (termVecPositions && termVecOffsets) { termVecVal = TermVector.WITH_POSITIONS_OFFSETS; } else if (termVecPositions) { termVecVal = TermVector.WITH_POSITIONS; } else if (termVecOffsets) { termVecVal = TermVector.WITH_OFFSETS; } else if (termVec) { termVecVal = TermVector.YES; } else { termVecVal = TermVector.NO; } storeBytes = config.get("doc.store.body.bytes", false); reuseFields = config.get("doc.reuse.fields", true); // In a multi-rounds run, it is important to reset DocState since settings // of fields may change between rounds, and this is the only way to reset // the cache of all threads. docState = new ThreadLocal<DocState>(); indexProperties = config.get("doc.index.props", false); updateDocIDLimit = config.get("doc.random.id.limit", -1); if (updateDocIDLimit != -1) { r = new Random(179); } }