public Dataset getDataset() throws ConfigurationException, DaoException { if (createFakeGoldStandard) { if (fakeGoldStandard == null) { Corpus c = env.getConfigurator() .get(Corpus.class, "plain", "language", env.getDefaultLanguage().getLangCode()); try { if (!c.exists()) c.create(); FakeDatasetCreator creator = new FakeDatasetCreator(c); fakeGoldStandard = creator.generate(500); } catch (IOException e) { throw new DaoException(e); } } return fakeGoldStandard; } else { DatasetDao dao = env.getConfigurator().get(DatasetDao.class); List<Dataset> datasets = new ArrayList<Dataset>(); for (String name : datasetNames) { datasets.addAll( dao.getDatasetOrGroup( language, name)); // throws a DaoException if language is incorrect. } return new Dataset(datasets); // merge all datasets together into one. } }
private void initWord2Vec(String name) throws ConfigurationException, IOException, DaoException { Config config = getMetricConfig(name).getConfig("generator"); File model = Word2VecGenerator.getModelFile(config.getString("modelDir"), language); if (skipBuiltMetrics && model.isFile()) { return; } if (config.hasPath("prebuilt") && config.getBoolean("prebuilt")) { if (model.isFile()) { return; } File downloadPath = new File(config.getString("binfile")); if (!downloadPath.isFile()) { throw new ConfigurationException( "word2vec model " + downloadPath.getAbsolutePath() + " cannot be found." + " You must download it from " + config.getString("url") + " into to the wikibrain download directory."); } if (!config.getStringList("languages").contains(language.getLangCode())) { throw new ConfigurationException( "word2vec model " + downloadPath + " does not support language" + language); } if (downloadPath.toString().toLowerCase().endsWith("gz")) { LOG.info("decompressing " + downloadPath + " to " + model); File tmp = File.createTempFile("word2vec", "bin"); try { FileUtils.deleteQuietly(tmp); GZIPInputStream gz = new GZIPInputStream(new FileInputStream(downloadPath)); FileUtils.copyInputStreamToFile(gz, tmp); gz.close(); model.getParentFile().mkdirs(); FileUtils.moveFile(tmp, model); } finally { FileUtils.deleteQuietly(tmp); } } else { FileUtils.copyFile(downloadPath, model); } return; } LinkProbabilityDao lpd = env.getConfigurator().get(LinkProbabilityDao.class); lpd.useCache(true); if (!lpd.isBuilt()) { lpd.build(); } String corpusName = config.getString("corpus"); Corpus corpus = null; if (!corpusName.equals("NONE")) { corpus = env.getConfigurator() .get(Corpus.class, config.getString("corpus"), "language", language.getLangCode()); if (!corpus.exists()) { corpus.create(); } } if (model.isFile() && (corpus == null || model.lastModified() > corpus.getCorpusFile().lastModified())) { return; } if (corpus == null) { throw new ConfigurationException( "word2vec metric " + name + " cannot build or find model!" + "configuration has no corpus, but model not found at " + model + "."); } Word2VecTrainer trainer = new Word2VecTrainer(env.getConfigurator().get(LocalPageDao.class), language); if (config.hasPath("dimensions")) { LOG.info("set number of dimensions to " + config.getInt("dimensions")); trainer.setLayer1Size(config.getInt("dimensions")); } if (config.hasPath("maxWords")) { LOG.info("set maxWords to " + config.getInt("maxWords")); trainer.setMaxWords(config.getInt("maxWords")); } if (config.hasPath("window")) { LOG.info("set window to " + config.getInt("maxWords")); trainer.setWindow(config.getInt("window")); } trainer.setKeepAllArticles(true); trainer.train(corpus.getDirectory()); trainer.save(model); }