public Dataset getDataset() throws ConfigurationException, DaoException { if (createFakeGoldStandard) { if (fakeGoldStandard == null) { Corpus c = env.getConfigurator() .get(Corpus.class, "plain", "language", env.getDefaultLanguage().getLangCode()); try { if (!c.exists()) c.create(); FakeDatasetCreator creator = new FakeDatasetCreator(c); fakeGoldStandard = creator.generate(500); } catch (IOException e) { throw new DaoException(e); } } return fakeGoldStandard; } else { DatasetDao dao = env.getConfigurator().get(DatasetDao.class); List<Dataset> datasets = new ArrayList<Dataset>(); for (String name : datasetNames) { datasets.addAll( dao.getDatasetOrGroup( language, name)); // throws a DaoException if language is incorrect. } return new Dataset(datasets); // merge all datasets together into one. } }
public SRBuilder(Env env, String metricName) throws ConfigurationException { this.env = env; this.language = env.getLanguages().getDefaultLanguage(); this.config = env.getConfiguration(); this.srDir = new File(config.get().getString("sr.metric.path")); datasetNames = config.get().getStringList("sr.dataset.defaultsets"); // Properly resolve the default metric name. this.metricName = env.getConfigurator().resolveComponentName(SRMetric.class, metricName); if (!srDir.isDirectory()) { srDir.mkdirs(); } }
private void buildConceptsIfNecessary() throws IOException, ConfigurationException, DaoException { boolean needsConcepts = false; for (String name : getSubmetrics(metricName)) { String type = getMetricType(name); if (type.equals("sparsevector.esa") || type.equals("sparsevector.mostsimilarconcepts")) { needsConcepts = true; } } if (!needsConcepts) { return; } File path = FileUtils.getFile( env.getConfiguration().get().getString("sr.concepts.path"), language.getLangCode() + ".txt"); path.getParentFile().mkdirs(); // Check to see if concepts are already built if (path.isFile() && FileUtils.readLines(path).size() > 1) { return; } LOG.info("building concept file " + path.getAbsolutePath() + " for " + metricName); SRConceptSpaceGenerator gen = new SRConceptSpaceGenerator( language, env.getConfigurator().get(LocalLinkDao.class), env.getConfigurator().get(LocalPageDao.class)); gen.writeConcepts(path); LOG.info( "finished creating concept file " + path.getAbsolutePath() + " with " + FileUtils.readLines(path).size() + " lines"); }
/**
 * Looks up the configuration block of the named SR metric via the configurator.
 *
 * @param name name of the metric whose configuration is requested
 * @return the configuration the configurator holds for {@code SRMetric} under that name
 * @throws ConfigurationException if the configurator cannot provide the configuration
 */
public Config getMetricConfig(String name) throws ConfigurationException {
    Config metricConfig = env.getConfigurator().getConfig(SRMetric.class, name);
    return metricConfig;
}
private void initWord2Vec(String name) throws ConfigurationException, IOException, DaoException { Config config = getMetricConfig(name).getConfig("generator"); File model = Word2VecGenerator.getModelFile(config.getString("modelDir"), language); if (skipBuiltMetrics && model.isFile()) { return; } if (config.hasPath("prebuilt") && config.getBoolean("prebuilt")) { if (model.isFile()) { return; } File downloadPath = new File(config.getString("binfile")); if (!downloadPath.isFile()) { throw new ConfigurationException( "word2vec model " + downloadPath.getAbsolutePath() + " cannot be found." + " You must download it from " + config.getString("url") + " into to the wikibrain download directory."); } if (!config.getStringList("languages").contains(language.getLangCode())) { throw new ConfigurationException( "word2vec model " + downloadPath + " does not support language" + language); } if (downloadPath.toString().toLowerCase().endsWith("gz")) { LOG.info("decompressing " + downloadPath + " to " + model); File tmp = File.createTempFile("word2vec", "bin"); try { FileUtils.deleteQuietly(tmp); GZIPInputStream gz = new GZIPInputStream(new FileInputStream(downloadPath)); FileUtils.copyInputStreamToFile(gz, tmp); gz.close(); model.getParentFile().mkdirs(); FileUtils.moveFile(tmp, model); } finally { FileUtils.deleteQuietly(tmp); } } else { FileUtils.copyFile(downloadPath, model); } return; } LinkProbabilityDao lpd = env.getConfigurator().get(LinkProbabilityDao.class); lpd.useCache(true); if (!lpd.isBuilt()) { lpd.build(); } String corpusName = config.getString("corpus"); Corpus corpus = null; if (!corpusName.equals("NONE")) { corpus = env.getConfigurator() .get(Corpus.class, config.getString("corpus"), "language", language.getLangCode()); if (!corpus.exists()) { corpus.create(); } } if (model.isFile() && (corpus == null || model.lastModified() > corpus.getCorpusFile().lastModified())) { return; } if (corpus == null) { throw new ConfigurationException( "word2vec metric " + name + " 
cannot build or find model!" + "configuration has no corpus, but model not found at " + model + "."); } Word2VecTrainer trainer = new Word2VecTrainer(env.getConfigurator().get(LocalPageDao.class), language); if (config.hasPath("dimensions")) { LOG.info("set number of dimensions to " + config.getInt("dimensions")); trainer.setLayer1Size(config.getInt("dimensions")); } if (config.hasPath("maxWords")) { LOG.info("set maxWords to " + config.getInt("maxWords")); trainer.setMaxWords(config.getInt("maxWords")); } if (config.hasPath("window")) { LOG.info("set window to " + config.getInt("maxWords")); trainer.setWindow(config.getInt("window")); } trainer.setKeepAllArticles(true); trainer.train(corpus.getDirectory()); trainer.save(model); }
/**
 * Obtains the named SR metric for the current language from the configurator.
 *
 * <p>The method is declared {@code synchronized}, so calls on the same builder instance are
 * serialized.
 *
 * @param name name of the metric to fetch
 * @return the metric the configurator returns for that name and language code
 * @throws ConfigurationException if the configurator cannot provide the metric
 */
public synchronized SRMetric getMetric(String name) throws ConfigurationException {
    SRMetric metric =
            env.getConfigurator().get(SRMetric.class, name, "language", language.getLangCode());
    return metric;
}