/**
 * Applies the given configuration to this source.
 *
 * <p>Delegates to the superclass first, then reads the {@code keep.image.only.docs} flag
 * (default {@code true}) and the mandatory {@code docs.file} property, whose value is stored
 * as an absolute {@link File}.
 *
 * @param config configuration to read the properties from
 * @throws IllegalArgumentException if {@code docs.file} is not set
 */
@Override
 public void setConfig(Config config) {
   super.setConfig(config);
   keepImages = config.get("keep.image.only.docs", true);

   // docs.file has no usable default, so a missing value is a configuration error.
   final String docsFileName = config.get("docs.file", null);
   if (docsFileName == null) {
     throw new IllegalArgumentException("docs.file must be set");
   }

   final File configuredFile = new File(docsFileName);
   file = configuredFile.getAbsoluteFile();
 }
  /**
   * Applies the given configuration to this source.
   *
   * <p>Delegates to the superclass, resolves the data directory from {@code docs.dir} (default
   * {@code "dir-out"}; a relative value is resolved against {@code work.dir}, default
   * {@code "work"}) and creates the input file iterator over that directory.
   *
   * @param config configuration to read the properties from
   */
  @Override
  public void setConfig(Config config) {
    super.setConfig(config);

    final File workingDir = new File(config.get("work.dir", "work"));
    final String docsDirName = config.get("docs.dir", "dir-out");

    // An absolute docs.dir is used verbatim; a relative one lives under the work directory.
    final File configuredDir = new File(docsDirName);
    dataDir = configuredDir.isAbsolute() ? configuredDir : new File(workingDir, docsDirName);

    inputFiles = new Iterator(dataDir);

    // NOTE(review): a `new` expression never evaluates to null, so this guard cannot fire as
    // written. It is kept unchanged to preserve behavior; an emptiness check on the iterator
    // was presumably intended -- confirm against the Iterator class.
    if (inputFiles == null) {
      throw new RuntimeException("No txt files in dataDir: " + dataDir.getAbsolutePath());
    }
  }
Пример #3
0
 /**
  * Applies the given configuration to this source.
  *
  * <p>After delegating to the superclass this method:
  *
  * <ul>
  *   <li>resolves the data directory from {@code docs.dir} (default {@code "trec"}; a relative
  *       value is resolved against {@code work.dir}, default {@code "work"});
  *   <li>collects the input files under that directory and fails if there are none;
  *   <li>instantiates the parsers named by {@code trec.doc.parser} and {@code html.parser} via
  *       their no-argument constructors;
  *   <li>falls back to {@code ISO-8859-1} when no encoding has been set;
  *   <li>reads the {@code content.source.excludeIteration} flag (default {@code false}).
  * </ul>
  *
  * @param config configuration to read the properties from
  * @throws IllegalArgumentException if no input files are found in the data directory
  * @throws RuntimeException if either parser class cannot be loaded or instantiated; the
  *     original failure is attached as the cause
  */
 @Override
 public void setConfig(Config config) {
   super.setConfig(config);
   // dirs
   File workDir = new File(config.get("work.dir", "work"));
   String d = config.get("docs.dir", "trec");
   dataDir = new File(d);
   if (!dataDir.isAbsolute()) {
     dataDir = new File(workDir, d);
   }
   // files
   collectFiles(dataDir, inputFiles);
   if (inputFiles.size() == 0) {
     throw new IllegalArgumentException("No files in dataDir: " + dataDir);
   }
   // trec doc parser
   try {
     String trecDocParserClassName =
         config.get("trec.doc.parser", "org.apache.lucene.benchmark.byTask.feeds.TrecGov2Parser");
     // Class.newInstance() is deprecated and rethrows checked constructor exceptions
     // undeclared; going through the no-arg Constructor wraps them instead.
     trecDocParser =
         Class.forName(trecDocParserClassName)
             .asSubclass(TrecDocParser.class)
             .getDeclaredConstructor()
             .newInstance();
   } catch (Exception e) {
     // Reached when the configured class is missing, of the wrong type, or cannot be
     // instantiated. Surface it as an unchecked exception, keeping the cause.
     throw new RuntimeException(e);
   }
   // html parser
   try {
     String htmlParserClassName =
         config.get("html.parser", "org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser");
     htmlParser =
         Class.forName(htmlParserClassName)
             .asSubclass(HTMLParser.class)
             .getDeclaredConstructor()
             .newInstance();
   } catch (Exception e) {
     // Same failure modes as for the trec doc parser above.
     throw new RuntimeException(e);
   }
   // encoding
   if (encoding == null) {
     encoding = "ISO-8859-1";
   }
   // iteration exclusion in doc name
   excludeDocnameIteration = config.get("content.source.excludeIteration", false);
 }
Пример #4
0
 /**
  * Applies the given configuration to this source.
  *
  * <p>Delegates to the superclass, stores the mandatory {@code docs.file} property as an
  * absolute path, and defaults the encoding to UTF-8 when none has been set.
  *
  * @param config configuration to read the properties from
  * @throws IllegalArgumentException if {@code docs.file} is not set
  */
 @Override
 public void setConfig(Config config) {
   super.setConfig(config);

   // docs.file has no usable default, so a missing value is a configuration error.
   final String docsFileName = config.get("docs.file", null);
   if (docsFileName == null) {
     throw new IllegalArgumentException("docs.file must be set");
   }
   file = Paths.get(docsFileName).toAbsolutePath();

   // Only fill in the encoding if nothing has set it already.
   if (encoding == null) {
     encoding = IOUtils.UTF_8;
   }
 }
Пример #5
0
  /**
   * Set the configuration parameters of this doc maker.
   *
   * <p>Instantiates the content source named by {@code content.source} through its no-argument
   * constructor and configures it, then derives the store, index and term-vector settings from
   * the {@code doc.*} properties. Body settings default to the corresponding general setting,
   * except {@code doc.body.tokenized.norms}, which defaults to {@code true} regardless of
   * {@code doc.tokenized.norms}.
   *
   * @param config configuration to read the properties from
   * @throws RuntimeException if the content source cannot be loaded, instantiated or
   *     configured; the original failure is attached as the cause
   */
  public void setConfig(Config config) {
    this.config = config;
    try {
      String sourceClass =
          config.get("content.source", "org.apache.lucene.benchmark.byTask.feeds.SingleDocSource");
      // Class.newInstance() is deprecated and rethrows checked constructor exceptions
      // undeclared; going through the no-arg Constructor wraps them instead.
      source =
          Class.forName(sourceClass)
              .asSubclass(ContentSource.class)
              .getDeclaredConstructor()
              .newInstance();
      source.setConfig(config);
    } catch (Exception e) {
      // Reached when the configured class is missing, of the wrong type, cannot be
      // instantiated, or rejects the configuration. Rethrow unchecked, keeping the cause.
      throw new RuntimeException(e);
    }

    boolean stored = config.get("doc.stored", false);
    boolean bodyStored = config.get("doc.body.stored", stored);
    boolean tokenized = config.get("doc.tokenized", true);
    boolean bodyTokenized = config.get("doc.body.tokenized", tokenized);
    boolean norms = config.get("doc.tokenized.norms", false);
    boolean bodyNorms = config.get("doc.body.tokenized.norms", true);
    boolean termVec = config.get("doc.term.vector", false);
    storeVal = (stored ? Field.Store.YES : Field.Store.NO);
    bodyStoreVal = (bodyStored ? Field.Store.YES : Field.Store.NO);

    // Index mode for non-body fields: analyzed or not, with or without norms.
    if (tokenized) {
      indexVal = norms ? Index.ANALYZED : Index.ANALYZED_NO_NORMS;
    } else {
      indexVal = norms ? Index.NOT_ANALYZED : Index.NOT_ANALYZED_NO_NORMS;
    }

    // Same decision for the body field, driven by the body-specific flags.
    if (bodyTokenized) {
      bodyIndexVal = bodyNorms ? Index.ANALYZED : Index.ANALYZED_NO_NORMS;
    } else {
      bodyIndexVal = bodyNorms ? Index.NOT_ANALYZED : Index.NOT_ANALYZED_NO_NORMS;
    }

    // Requesting positions and/or offsets enables term vectors even when doc.term.vector
    // itself is false; the plain flag only matters when neither is requested.
    boolean termVecPositions = config.get("doc.term.vector.positions", false);
    boolean termVecOffsets = config.get("doc.term.vector.offsets", false);
    if (termVecPositions && termVecOffsets) {
      termVecVal = TermVector.WITH_POSITIONS_OFFSETS;
    } else if (termVecPositions) {
      termVecVal = TermVector.WITH_POSITIONS;
    } else if (termVecOffsets) {
      termVecVal = TermVector.WITH_OFFSETS;
    } else if (termVec) {
      termVecVal = TermVector.YES;
    } else {
      termVecVal = TermVector.NO;
    }
    storeBytes = config.get("doc.store.body.bytes", false);

    reuseFields = config.get("doc.reuse.fields", true);

    // In a multi-rounds run, it is important to reset DocState since settings
    // of fields may change between rounds, and this is the only way to reset
    // the cache of all threads.
    docState = new ThreadLocal<DocState>();

    indexProperties = config.get("doc.index.props", false);

    // -1 means no limit is configured; only then is the random generator left untouched.
    updateDocIDLimit = config.get("doc.random.id.limit", -1);
    if (updateDocIDLimit != -1) {
      // Fixed seed, so the generated sequence is the same on every run.
      r = new Random(179);
    }
  }