Exemple #1
0
 /**
  * Returns the dataset to work with.
  *
  * <p>When {@code createFakeGoldStandard} is set, a fake gold standard of 500 entries is generated
  * from the "plain" corpus of the environment's default language the first time this method is
  * called; the generated dataset is stored in {@code fakeGoldStandard} and returned on later
  * calls. Otherwise every name in {@code datasetNames} is resolved through the {@code DatasetDao}
  * for {@code language} and all results are merged into one new dataset.
  *
  * @return the cached fake gold standard, or a dataset merging all named datasets
  * @throws ConfigurationException propagated from the configurator lookups
  * @throws DaoException if a dataset lookup fails, or wrapping an {@code IOException} raised
  *     while preparing the corpus or generating the fake dataset
  */
 public Dataset getDataset() throws ConfigurationException, DaoException {
   if (!createFakeGoldStandard) {
     DatasetDao datasetDao = env.getConfigurator().get(DatasetDao.class);
     List<Dataset> parts = new ArrayList<Dataset>();
     for (String datasetName : datasetNames) {
       // throws a DaoException if language is incorrect.
       parts.addAll(datasetDao.getDatasetOrGroup(language, datasetName));
     }
     // Merge all datasets together into one.
     return new Dataset(parts);
   }

   // Fake gold standard already generated by an earlier call: reuse it.
   if (fakeGoldStandard != null) {
     return fakeGoldStandard;
   }

   Corpus plainCorpus =
       env.getConfigurator()
           .get(Corpus.class, "plain", "language", env.getDefaultLanguage().getLangCode());
   try {
     if (!plainCorpus.exists()) {
       plainCorpus.create();
     }
     fakeGoldStandard = new FakeDatasetCreator(plainCorpus).generate(500);
   } catch (IOException ioFailure) {
     throw new DaoException(ioFailure);
   }
   return fakeGoldStandard;
 }
Exemple #2
0
  /**
   * Makes sure the word2vec model file for the metric {@code name} is available for {@code
   * language}, installing a prebuilt model or training a new one when needed.
   *
   * <p>Steps, in order:
   *
   * <ol>
   *   <li>If {@code skipBuiltMetrics} is set and the model file exists, nothing is done.
   *   <li>If the generator config has {@code prebuilt = true}: an existing model file is kept;
   *       otherwise the file named by {@code binfile} is copied (or gunzipped, when its name ends
   *       in "gz") to the model location.
   *   <li>Otherwise the {@code LinkProbabilityDao} is built if necessary, the configured corpus is
   *       created if necessary (unless the corpus name is "NONE"), and a model is trained and
   *       saved unless an existing model file is newer than the corpus file.
   * </ol>
   *
   * @param name name of the metric whose "generator" configuration is used
   * @throws ConfigurationException if the prebuilt binary is missing, the prebuilt model does not
   *     list the current language, or no model exists and no corpus is configured to build one
   * @throws IOException if copying, decompressing or moving the model file fails
   * @throws DaoException declared for the DAO and corpus operations performed here
   */
  private void initWord2Vec(String name) throws ConfigurationException, IOException, DaoException {
    Config config = getMetricConfig(name).getConfig("generator");
    File model = Word2VecGenerator.getModelFile(config.getString("modelDir"), language);
    if (skipBuiltMetrics && model.isFile()) {
      return;
    }

    if (config.hasPath("prebuilt") && config.getBoolean("prebuilt")) {
      if (model.isFile()) {
        return;
      }
      File downloadPath = new File(config.getString("binfile"));
      if (!downloadPath.isFile()) {
        throw new ConfigurationException(
            "word2vec model "
                + downloadPath.getAbsolutePath()
                + " cannot be found."
                + " You must download it from "
                + config.getString("url")
                + " into the wikibrain download directory.");
      }
      if (!config.getStringList("languages").contains(language.getLangCode())) {
        throw new ConfigurationException(
            "word2vec model " + downloadPath + " does not support language " + language);
      }
      if (downloadPath.toString().toLowerCase().endsWith("gz")) {
        LOG.info("decompressing " + downloadPath + " to " + model);
        // Decompress into a temp file first so a failed decompression never leaves a
        // partially written file at the model location.
        File tmp = File.createTempFile("word2vec", "bin");
        try {
          // createTempFile creates an empty file; remove it before writing the real content.
          FileUtils.deleteQuietly(tmp);
          FileInputStream raw = new FileInputStream(downloadPath);
          try {
            GZIPInputStream gz = new GZIPInputStream(raw);
            try {
              FileUtils.copyInputStreamToFile(gz, tmp);
            } finally {
              gz.close();
            }
          } finally {
            // Also covers the case where the GZIPInputStream constructor itself fails.
            raw.close();
          }
          File parent = model.getParentFile();
          if (parent != null) {
            parent.mkdirs();
          }
          FileUtils.moveFile(tmp, model);
        } finally {
          // No-op after a successful move; cleans up the temp file on failure.
          FileUtils.deleteQuietly(tmp);
        }
      } else {
        FileUtils.copyFile(downloadPath, model);
      }
      return;
    }

    LinkProbabilityDao lpd = env.getConfigurator().get(LinkProbabilityDao.class);
    lpd.useCache(true);
    if (!lpd.isBuilt()) {
      lpd.build();
    }

    // A corpus name of "NONE" means no corpus is configured; the model must then already exist.
    String corpusName = config.getString("corpus");
    Corpus corpus = null;
    if (!corpusName.equals("NONE")) {
      corpus =
          env.getConfigurator().get(Corpus.class, corpusName, "language", language.getLangCode());
      if (!corpus.exists()) {
        corpus.create();
      }
    }

    // Keep the existing model when there is no corpus to compare against, or when the model
    // file is newer than the corpus file.
    if (model.isFile()
        && (corpus == null || model.lastModified() > corpus.getCorpusFile().lastModified())) {
      return;
    }
    if (corpus == null) {
      throw new ConfigurationException(
          "word2vec metric "
              + name
              + " cannot build or find model! "
              + "configuration has no corpus, but model not found at "
              + model
              + ".");
    }
    Word2VecTrainer trainer =
        new Word2VecTrainer(env.getConfigurator().get(LocalPageDao.class), language);
    if (config.hasPath("dimensions")) {
      LOG.info("set number of dimensions to " + config.getInt("dimensions"));
      trainer.setLayer1Size(config.getInt("dimensions"));
    }
    if (config.hasPath("maxWords")) {
      LOG.info("set maxWords to " + config.getInt("maxWords"));
      trainer.setMaxWords(config.getInt("maxWords"));
    }
    if (config.hasPath("window")) {
      // Log the window value itself; reading "maxWords" here reported the wrong number and
      // failed when only "window" was configured.
      LOG.info("set window to " + config.getInt("window"));
      trainer.setWindow(config.getInt("window"));
    }
    trainer.setKeepAllArticles(true);
    trainer.train(corpus.getDirectory());
    trainer.save(model);
  }