コード例 #1
0
ファイル: SRBuilder.java プロジェクト: monkey2000/wikibrain
 /**
  * Removes the on-disk data directory of every submetric of {@code metricName} for the current
  * language.
  *
  * <p>Only metric names are used to locate the directories; the metrics themselves are never
  * loaded here, because a loaded metric has already accessed its data files.
  *
  * <p>Directories that do not exist are skipped. Deletion goes through
  * {@code FileUtils.deleteQuietly}, so a failed delete is not reported to the caller.
  *
  * @throws ConfigurationException declared by this method; NOTE(review): presumably raised while
  *     resolving the submetric names -- confirm against {@code getSubmetrics}
  */
 public void deleteDataDirectories() throws ConfigurationException {
   for (String submetric : getSubmetrics(metricName)) {
     File metricDir = FileUtils.getFile(srDir, submetric, language.getLangCode());
     if (!metricDir.exists()) {
       // Nothing on disk for this submetric/language pair.
       continue;
     }
     LOG.info("deleting metric directory " + metricDir);
     FileUtils.deleteQuietly(metricDir);
   }
 }
コード例 #2
0
ファイル: SRBuilder.java プロジェクト: monkey2000/wikibrain
  /**
   * Writes the concept file for the current language when at least one submetric of
   * {@code metricName} needs it and no usable file exists yet.
   *
   * <p>A concept file is needed when any submetric has type {@code "sparsevector.esa"} or
   * {@code "sparsevector.mostsimilarconcepts"}. The file lives under the directory configured as
   * {@code sr.concepts.path} and is named {@code <langCode>.txt}. An existing file with more than
   * one line is treated as already built and left untouched.
   *
   * @throws IOException if reading the concept file fails (and possibly while writing it)
   * @throws ConfigurationException declared by this method; NOTE(review): presumably from the
   *     configurator or metric lookups -- confirm
   * @throws DaoException declared by this method; NOTE(review): presumably from concept generation
   *     -- confirm
   */
  private void buildConceptsIfNecessary() throws IOException, ConfigurationException, DaoException {
    // Every submetric is inspected (no early exit), exactly one type lookup per submetric.
    boolean conceptsRequired = false;
    for (String submetric : getSubmetrics(metricName)) {
      String metricType = getMetricType(submetric);
      conceptsRequired |=
          metricType.equals("sparsevector.esa")
              || metricType.equals("sparsevector.mostsimilarconcepts");
    }
    if (!conceptsRequired) {
      return;
    }

    String conceptsDir = env.getConfiguration().get().getString("sr.concepts.path");
    File conceptFile = FileUtils.getFile(conceptsDir, language.getLangCode() + ".txt");
    conceptFile.getParentFile().mkdirs();

    // A file with more than one line counts as an already-built concept file.
    boolean alreadyBuilt = conceptFile.isFile() && FileUtils.readLines(conceptFile).size() > 1;
    if (alreadyBuilt) {
      return;
    }

    String absolutePath = conceptFile.getAbsolutePath();
    LOG.info("building concept file " + absolutePath + " for " + metricName);
    SRConceptSpaceGenerator generator =
        new SRConceptSpaceGenerator(
            language,
            env.getConfigurator().get(LocalLinkDao.class),
            env.getConfigurator().get(LocalPageDao.class));
    generator.writeConcepts(conceptFile);

    int lineCount = FileUtils.readLines(conceptFile).size();
    LOG.info(
        "finished creating concept file " + conceptFile.getAbsolutePath() + " with " + lineCount + " lines");
  }
コード例 #3
0
ファイル: SRBuilder.java プロジェクト: monkey2000/wikibrain
  /**
   * Makes sure the word2vec model file for metric {@code name} exists for the current language,
   * either by installing a prebuilt model or by training a new one.
   *
   * <p>Processing order:
   *
   * <ol>
   *   <li>If {@code skipBuiltMetrics} is set and the model file already exists, nothing is done.
   *   <li>If the generator config has {@code prebuilt = true}: the file named by {@code binfile}
   *       is copied to the model location, or gunzipped there when its name ends in "gz". The
   *       current language must appear in the config's {@code languages} list.
   *   <li>Otherwise link probabilities are built if missing, the configured corpus (unless it is
   *       {@code "NONE"}) is created if missing, and a model is trained when the model file is
   *       absent or not newer than the corpus file.
   * </ol>
   *
   * @param name name of the word2vec metric whose {@code generator} configuration is read
   * @throws ConfigurationException if the prebuilt binary is missing, the prebuilt model does not
   *     list the current language, or no model exists and no corpus is configured to train one
   * @throws IOException if copying or decompressing the prebuilt model fails
   * @throws DaoException declared by this method; NOTE(review): presumably from the DAO-backed
   *     build/training steps -- confirm
   */
  private void initWord2Vec(String name) throws ConfigurationException, IOException, DaoException {
    Config config = getMetricConfig(name).getConfig("generator");
    File model = Word2VecGenerator.getModelFile(config.getString("modelDir"), language);
    if (skipBuiltMetrics && model.isFile()) {
      return;
    }

    if (config.hasPath("prebuilt") && config.getBoolean("prebuilt")) {
      if (model.isFile()) {
        return;
      }
      File downloadPath = new File(config.getString("binfile"));
      if (!downloadPath.isFile()) {
        throw new ConfigurationException(
            "word2vec model "
                + downloadPath.getAbsolutePath()
                + " cannot be found."
                + " You must download it from "
                + config.getString("url")
                + " into the wikibrain download directory.");
      }
      if (!config.getStringList("languages").contains(language.getLangCode())) {
        throw new ConfigurationException(
            "word2vec model " + downloadPath + " does not support language " + language);
      }
      if (downloadPath.toString().toLowerCase().endsWith("gz")) {
        LOG.info("decompressing " + downloadPath + " to " + model);
        // Decompress into a temp file first so a partial model never appears at the final path.
        File tmp = File.createTempFile("word2vec", "bin");
        try {
          FileUtils.deleteQuietly(tmp);
          FileInputStream in = new FileInputStream(downloadPath);
          try {
            GZIPInputStream gz = new GZIPInputStream(in);
            FileUtils.copyInputStreamToFile(gz, tmp);
            gz.close();
          } finally {
            // Releases the file handle even if the gzip header is invalid or the copy fails;
            // closing an already-closed FileInputStream is harmless.
            in.close();
          }
          model.getParentFile().mkdirs();
          FileUtils.moveFile(tmp, model);
        } finally {
          // No-op after a successful move; removes the partial temp file on failure.
          FileUtils.deleteQuietly(tmp);
        }
      } else {
        FileUtils.copyFile(downloadPath, model);
      }
      return;
    }

    LinkProbabilityDao lpd = env.getConfigurator().get(LinkProbabilityDao.class);
    lpd.useCache(true);
    if (!lpd.isBuilt()) {
      lpd.build();
    }

    // A corpus name of "NONE" means there is nothing to train from; corpus stays null.
    String corpusName = config.getString("corpus");
    Corpus corpus = null;
    if (!corpusName.equals("NONE")) {
      corpus =
          env.getConfigurator().get(Corpus.class, corpusName, "language", language.getLangCode());
      if (!corpus.exists()) {
        corpus.create();
      }
    }

    // An existing model is reused when there is no corpus, or when it is newer than the corpus.
    if (model.isFile()
        && (corpus == null || model.lastModified() > corpus.getCorpusFile().lastModified())) {
      return;
    }
    if (corpus == null) {
      throw new ConfigurationException(
          "word2vec metric "
              + name
              + " cannot build or find model! "
              + "configuration has no corpus, but model not found at "
              + model
              + ".");
    }
    Word2VecTrainer trainer =
        new Word2VecTrainer(env.getConfigurator().get(LocalPageDao.class), language);
    if (config.hasPath("dimensions")) {
      int dimensions = config.getInt("dimensions");
      LOG.info("set number of dimensions to " + dimensions);
      trainer.setLayer1Size(dimensions);
    }
    if (config.hasPath("maxWords")) {
      int maxWords = config.getInt("maxWords");
      LOG.info("set maxWords to " + maxWords);
      trainer.setMaxWords(maxWords);
    }
    if (config.hasPath("window")) {
      // Read the "window" key itself; logging "maxWords" here printed the wrong value and
      // failed when only "window" was configured.
      int window = config.getInt("window");
      LOG.info("set window to " + window);
      trainer.setWindow(window);
    }
    trainer.setKeepAllArticles(true);
    trainer.train(corpus.getDirectory());
    trainer.save(model);
  }
コード例 #4
0
ファイル: SRBuilder.java プロジェクト: monkey2000/wikibrain
 /**
  * Looks up the SR metric called {@code name} for this builder's language.
  *
  * <p>The lookup is delegated to the environment's configurator, which receives the key
  * {@code "language"} paired with the current language code. The method is synchronized on this
  * instance.
  *
  * @param name name of the metric to fetch
  * @return the metric returned by the configurator
  * @throws ConfigurationException declared by this method; NOTE(review): presumably raised by the
  *     configurator when the metric cannot be resolved -- confirm
  */
 public synchronized SRMetric getMetric(String name) throws ConfigurationException {
   String langCode = language.getLangCode();
   return env.getConfigurator().get(SRMetric.class, name, "language", langCode);
 }