/**
 * Same as {@link #makeDocument()}, only this method creates a document of the given size input by
 * <code>size</code>.
 *
 * <p>The body is assembled from this thread's leftover {@link DocData} (if one with a non-empty
 * body is stored) and, while the accumulated body is shorter than <code>size</code>, from further
 * docs pulled from the content source. A null body is treated like an empty one when bodies are
 * joined. If a non-empty body remains after the document was created, it is stored as this
 * thread's leftover for the next call; otherwise the leftovers are reset.
 *
 * @param size requested document size, compared against the length of the accumulated body
 * @return the created document
 * @throws Exception propagated from the content source or from document creation
 */
public Document makeDocument(int size) throws Exception {
  LeftOver lvr = leftovr.get();
  if (lvr == null
      || lvr.docdata == null
      || lvr.docdata.getBody() == null
      || lvr.docdata.getBody().length() == 0) {
    resetLeftovers();
    // The stored leftover is unusable. Drop the local reference as well, otherwise a leftover
    // with a null docdata would be dereferenced below, and the new leftover would be written to
    // an object that may no longer be registered in the thread-local.
    lvr = null;
  }
  DocData docData = getDocState().docData;
  DocData dd = (lvr == null ? source.getNextDocData(docData) : lvr.docdata);
  int cnt = (lvr == null ? 0 : lvr.cnt);
  while (dd.getBody() == null || dd.getBody().length() < size) {
    DocData dd2 = dd;
    dd = source.getNextDocData(new DocData());
    cnt = 0;
    // Join null-safely: plain string concatenation would embed the literal text "null".
    String head = dd2.getBody();
    String tail = dd.getBody();
    dd.setBody((head == null ? "" : head) + (tail == null ? "" : tail));
  }
  Document doc = createDocument(dd, size, cnt);
  if (dd.getBody() == null || dd.getBody().length() == 0) {
    resetLeftovers();
  } else {
    if (lvr == null) {
      lvr = new LeftOver();
      leftovr.set(lvr);
    }
    lvr.docdata = dd;
    lvr.cnt = ++cnt;
  }
  return doc;
}
/** Reset inputs so that the test run would behave, input wise, as if it just started. */ public synchronized void resetInputs() throws IOException { source.printStatistics("docs"); // re-initiate since properties by round may have changed. setConfig(config); source.resetInputs(); numDocsCreated.set(0); resetLeftovers(); }
/**
 * Returns the content source registered for the host of the given URL, with the URL set on its
 * resource builder.
 *
 * <p>The host is lower-cased with {@link java.util.Locale#ROOT} before the lookup, so the result
 * does not depend on the JVM's default locale. If no source is registered for the host, or the
 * registered source has no resource builder, {@code ContentSource.GENERIC} is used instead.
 *
 * <p>Note that the returned object is the registered instance itself (not a copy), and that the
 * URL is set on that instance's resource builder.
 *
 * @param url the URL to find a content source for
 * @return the matching content source, or {@code ContentSource.GENERIC} as fallback
 */
public static ContentSource newInstance(URL url) {
  String host = url.getHost().toLowerCase(java.util.Locale.ROOT);
  Log.d("matching on: " + host);
  ContentSource s = sources.get(host);
  if (s == null || s.getResourceBuilder() == null) {
    s = ContentSource.GENERIC;
  }
  s.getResourceBuilder().setURL(url);
  return s;
}
/** * Set the repository the formatter can load object contents from. * * <p>Once a repository has been set, the formatter must be released to ensure the internal * ObjectReader is able to release its resources. * * @param repository source repository holding referenced objects. */ public void setRepository(Repository repository) { if (reader != null) reader.release(); db = repository; reader = db.newObjectReader(); ContentSource cs = ContentSource.create(reader); source = new ContentSource.Pair(cs, cs); DiffConfig dc = db.getConfig().get(DiffConfig.KEY); if (dc.isNoPrefix()) { setOldPrefix(""); // $NON-NLS-1$ setNewPrefix(""); // $NON-NLS-1$ } setDetectRenames(dc.isRenameDetectionEnabled()); diffAlgorithm = DiffAlgorithm.getAlgorithm( db.getConfig() .getEnum( ConfigConstants.CONFIG_DIFF_SECTION, null, ConfigConstants.CONFIG_KEY_ALGORITHM, SupportedAlgorithm.HISTOGRAM)); }
/**
 * Two instances are equal when both their {@code packageVersion} and their {@code contentSource}
 * are equal; a null field only matches a null field.
 */
@Override
public boolean equals(Object obj) {
  if (obj == this) {
    return true;
  }
  // instanceof is false for null, so this also rejects a null argument.
  if (!(obj instanceof PackageVersionContentSource)) {
    return false;
  }

  final PackageVersionContentSource that = (PackageVersionContentSource) obj;

  final boolean samePackageVersion =
      (packageVersion == null)
          ? (that.packageVersion == null)
          : packageVersion.equals(that.packageVersion);
  if (!samePackageVersion) {
    return false;
  }

  return (contentSource == null)
      ? (that.contentSource == null)
      : contentSource.equals(that.contentSource);
}
/**
 * Combines the hash codes of {@code packageVersion} and {@code contentSource} (0 for a null
 * field) using the multiplier 31.
 */
@Override
public int hashCode() {
  final int multiplier = 31;

  int packageVersionHash = 0;
  if (packageVersion != null) {
    packageVersionHash = packageVersion.hashCode();
  }
  // Seed of 1, folded once: 31 * 1 + packageVersionHash.
  int hash = multiplier + packageVersionHash;

  int contentSourceHash = 0;
  if (contentSource != null) {
    contentSourceHash = contentSource.hashCode();
  }
  hash = (multiplier * hash) + contentSourceHash;

  return hash;
}
/**
 * Resets the inherited input state, closes this source and rewinds the file and iteration
 * counters to zero, all while holding {@code lock}.
 *
 * @throws IOException propagated from the superclass reset or from {@code close()}
 */
@Override
public void resetInputs() throws IOException {
  synchronized (lock) {
    super.resetInputs();
    close();

    // Start over from the first file and the first iteration.
    nextFile = 0;
    iteration = 0;
  }
}
/**
 * Applies the configuration: reads {@code keep.image.only.docs} (default true) and the mandatory
 * {@code docs.file} property, whose value is stored as an absolute file.
 *
 * @param config the configuration to read from
 * @throws IllegalArgumentException if {@code docs.file} is not set
 */
@Override
public void setConfig(Config config) {
  super.setConfig(config);
  keepImages = config.get("keep.image.only.docs", true);

  final String docsFileName = config.get("docs.file", null);
  if (docsFileName != null) {
    file = new File(docsFileName).getAbsoluteFile();
    return;
  }
  throw new IllegalArgumentException("docs.file must be set");
}
/**
 * Applies the configuration: resolves the mandatory {@code docs.file} property to an absolute
 * path and falls back to UTF-8 when no encoding has been set.
 *
 * @param config the configuration to read from
 * @throws IllegalArgumentException if {@code docs.file} is not set
 */
@Override
public void setConfig(Config config) {
  super.setConfig(config);

  final String docsFileName = config.get("docs.file", null);
  if (docsFileName == null) {
    throw new IllegalArgumentException("docs.file must be set");
  }
  file = Paths.get(docsFileName).toAbsolutePath();

  // Only fill in the default; an encoding that is already set is left alone.
  if (encoding != null) {
    return;
  }
  encoding = IOUtils.UTF_8;
}
/**
 * Applies the configuration: resolves the data directory from {@code docs.dir} (default
 * {@code "dir-out"}; a relative value is resolved against {@code work.dir}, default
 * {@code "work"}) and creates the iterator over its input files.
 *
 * @param config the configuration to read from
 * @throws RuntimeException if the iterator over the data directory has no file to offer
 */
@Override
public void setConfig(Config config) {
  super.setConfig(config);

  File workDir = new File(config.get("work.dir", "work"));
  String d = config.get("docs.dir", "dir-out");
  dataDir = new File(d);
  if (!dataDir.isAbsolute()) {
    dataDir = new File(workDir, d);
  }

  inputFiles = new Iterator(dataDir);
  // A freshly constructed object is never null, so the former "inputFiles == null" test could
  // not fire. Ask the iterator itself whether there is anything to read.
  // NOTE(review): relies on the project's Iterator exposing hasNext() - confirm.
  if (!inputFiles.hasNext()) {
    throw new RuntimeException("No txt files in dataDir: " + dataDir.getAbsolutePath());
  }
}
@Override public void setConfig(Config config) { super.setConfig(config); // dirs File workDir = new File(config.get("work.dir", "work")); String d = config.get("docs.dir", "trec"); dataDir = new File(d); if (!dataDir.isAbsolute()) { dataDir = new File(workDir, d); } // files collectFiles(dataDir, inputFiles); if (inputFiles.size() == 0) { throw new IllegalArgumentException("No files in dataDir: " + dataDir); } // trec doc parser try { String trecDocParserClassName = config.get("trec.doc.parser", "org.apache.lucene.benchmark.byTask.feeds.TrecGov2Parser"); trecDocParser = Class.forName(trecDocParserClassName).asSubclass(TrecDocParser.class).newInstance(); } catch (Exception e) { // Should not get here. Throw runtime exception. throw new RuntimeException(e); } // html parser try { String htmlParserClassName = config.get("html.parser", "org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser"); htmlParser = Class.forName(htmlParserClassName).asSubclass(HTMLParser.class).newInstance(); } catch (Exception e) { // Should not get here. Throw runtime exception. throw new RuntimeException(e); } // encoding if (encoding == null) { encoding = "ISO-8859-1"; } // iteration exclusion in doc name excludeDocnameIteration = config.get("content.source.excludeIteration", false); }
/** Set the configuration parameters of this doc maker. */ public void setConfig(Config config) { this.config = config; try { String sourceClass = config.get("content.source", "org.apache.lucene.benchmark.byTask.feeds.SingleDocSource"); source = Class.forName(sourceClass).asSubclass(ContentSource.class).newInstance(); source.setConfig(config); } catch (Exception e) { // Should not get here. Throw runtime exception. throw new RuntimeException(e); } boolean stored = config.get("doc.stored", false); boolean bodyStored = config.get("doc.body.stored", stored); boolean tokenized = config.get("doc.tokenized", true); boolean bodyTokenized = config.get("doc.body.tokenized", tokenized); boolean norms = config.get("doc.tokenized.norms", false); boolean bodyNorms = config.get("doc.body.tokenized.norms", true); boolean termVec = config.get("doc.term.vector", false); storeVal = (stored ? Field.Store.YES : Field.Store.NO); bodyStoreVal = (bodyStored ? Field.Store.YES : Field.Store.NO); if (tokenized) { indexVal = norms ? Index.ANALYZED : Index.ANALYZED_NO_NORMS; } else { indexVal = norms ? Index.NOT_ANALYZED : Index.NOT_ANALYZED_NO_NORMS; } if (bodyTokenized) { bodyIndexVal = bodyNorms ? Index.ANALYZED : Index.ANALYZED_NO_NORMS; } else { bodyIndexVal = bodyNorms ? 
Index.NOT_ANALYZED : Index.NOT_ANALYZED_NO_NORMS; } boolean termVecPositions = config.get("doc.term.vector.positions", false); boolean termVecOffsets = config.get("doc.term.vector.offsets", false); if (termVecPositions && termVecOffsets) { termVecVal = TermVector.WITH_POSITIONS_OFFSETS; } else if (termVecPositions) { termVecVal = TermVector.WITH_POSITIONS; } else if (termVecOffsets) { termVecVal = TermVector.WITH_OFFSETS; } else if (termVec) { termVecVal = TermVector.YES; } else { termVecVal = TermVector.NO; } storeBytes = config.get("doc.store.body.bytes", false); reuseFields = config.get("doc.reuse.fields", true); // In a multi-rounds run, it is important to reset DocState since settings // of fields may change between rounds, and this is the only way to reset // the cache of all threads. docState = new ThreadLocal<DocState>(); indexProperties = config.get("doc.index.props", false); updateDocIDLimit = config.get("doc.random.id.limit", -1); if (updateDocIDLimit != -1) { r = new Random(179); } }
/**
 * Resets the inherited input state and then opens the file again via {@code openFile()}.
 *
 * @throws IOException propagated from the superclass reset or from {@code openFile()}
 */
@Override
public void resetInputs() throws IOException {
  super.resetInputs();

  // Reading restarts from whatever openFile() sets up.
  openFile();
}
/**
 * Resets the inherited input state, creates a fresh iterator over the data directory and
 * rewinds the iteration counter to zero.
 *
 * @throws IOException propagated from the superclass reset
 */
@Override
public synchronized void resetInputs() throws IOException {
  super.resetInputs();

  // Start walking the data directory from the beginning again.
  inputFiles = new Iterator(dataDir);
  iteration = 0;
}
/**
 * Closes the {@link DocMaker}. The base implementation only closes the {@link ContentSource};
 * subclasses may override this to do more work, but must make sure to call super.close().
 *
 * @throws IOException propagated from closing the content source
 */
public void close() throws IOException {
  // Nothing but the content source is closed at this level.
  source.close();
}
/**
 * Resets the inherited input state and obtains a new input stream for {@code file}.
 *
 * @throws IOException propagated from the superclass reset or from {@code getInputStream}
 */
@Override
public void resetInputs() throws IOException {
  super.resetInputs();

  // Replace the current stream with a newly obtained one for the same file.
  is = getInputStream(file);
}
/**
 * Returns the number of bytes generated by the content source since last reset.
 *
 * @return the byte count as reported by the content source
 */
public synchronized long getBytesCount() {
  final long bytesSinceReset = source.getBytesCount();
  return bytesSinceReset;
}
/**
 * Returns the total number of bytes that were generated by the content source defined to that
 * doc maker.
 *
 * @return the total byte count as reported by the content source
 */
public long getTotalBytesCount() {
  final long totalBytes = source.getTotalBytesCount();
  return totalBytes;
}
/**
 * Creates a {@link Document} object ready for indexing. This method uses the {@link
 * ContentSource} to get the next document from the source, and creates a {@link Document} object
 * from the returned fields. If <code>reuseFields</code> was set to true, it will reuse {@link
 * Document} and {@link Field} instances.
 *
 * @return the created document
 * @throws Exception propagated from the content source or from document creation
 */
public Document makeDocument() throws Exception {
  // Leftovers are reset before every call of this variant.
  resetLeftovers();

  final DocData nextDocData = source.getNextDocData(getDocState().docData);
  return createDocument(nextDocData, 0, -1);
}
/**
 * Picks the content source for an iterator: a working tree iterator gets a source created from
 * the iterator itself, every other iterator gets one created from the object reader.
 *
 * @param iterator the tree iterator to create a content source for
 * @return the content source to read the iterator's entries from
 */
private ContentSource source(AbstractTreeIterator iterator) {
  return (iterator instanceof WorkingTreeIterator)
      ? ContentSource.create((WorkingTreeIterator) iterator)
      : ContentSource.create(reader);
}
/**
 * Registers the given content source under each of the hostnames it reports. A hostname that
 * is already present in the registry is mapped to the new source.
 *
 * @param s the content source to register
 */
public static void registerContentSource(ContentSource s) {
  for (final String registeredHost : s.getHostnames()) {
    // Keys are stored exactly as the source reports them.
    sources.put(registeredHost, s);
  }
}