Пример #1
0
  /**
   * Deserializes the JSON form of one Wikidata document and copies it into a
   * {@link WikidataEntity}.
   *
   * <p>Aliases, descriptions and labels are copied only for language codes accepted by
   * {@code validLanguage}. Statements are copied only when the document is a
   * {@code JacksonItemDocument}.
   *
   * @param json JSON text of a single entity document
   * @return the entity populated from the document
   * @throws WpParseException if the JSON cannot be read into a document
   */
  public WikidataEntity parse(String json) throws WpParseException {
    JacksonTermedStatementDocument doc;
    try {
      doc = mapper.readValue(json, JacksonTermedStatementDocument.class);
    } catch (IOException e) {
      LOG.info("Error parsing: " + json);
      throw new WpParseException(e);
    }

    WikidataEntity entity = new WikidataEntity(doc.getEntityId().getId());

    // Aliases: each group holds every alias of one language; the first element's
    // language code decides whether the whole group is kept.
    for (List<MonolingualTextValue> aliasGroup : doc.getAliases().values()) {
      if (aliasGroup.isEmpty()) {
        continue;
      }
      String code = aliasGroup.get(0).getLanguageCode();
      if (!validLanguage(code)) {
        continue;
      }
      Language lang = Language.getByLangCodeLenient(code);
      List<String> texts = new ArrayList<String>();
      entity.getAliases().put(lang, texts);
      for (MonolingualTextValue alias : aliasGroup) {
        texts.add(alias.getText());
      }
    }

    // Descriptions: a single text per language.
    for (MonolingualTextValue description : doc.getDescriptions().values()) {
      String code = description.getLanguageCode();
      if (!validLanguage(code)) {
        continue;
      }
      entity.getDescriptions().put(Language.getByLangCodeLenient(code), description.getText());
    }

    // Labels: a single text per language.
    for (MonolingualTextValue label : doc.getLabels().values()) {
      String code = label.getLanguageCode();
      if (!validLanguage(code)) {
        continue;
      }
      entity.getLabels().put(Language.getByLangCodeLenient(code), label.getText());
    }

    // Claims exist only on item documents.
    if (doc instanceof JacksonItemDocument) {
      JacksonItemDocument item = (JacksonItemDocument) doc;
      for (List<JacksonStatement> claimGroup : item.getJsonClaims().values()) {
        for (JacksonStatement claim : claimGroup) {
          entity.getStatements().add(parseOneClaim(entity, claim));
        }
      }
    }

    return entity;
  }
Пример #2
0
 /**
  * Deletes, for every submetric name of {@code metricName}, the directory
  * {@code srDir/<submetric>/<language code>} if it exists.
  *
  * <p>Only metric names are used here; the metric objects themselves are never loaded, because a
  * loaded metric has already accessed its data files.
  *
  * @throws ConfigurationException propagated from the calls made while enumerating the submetrics
  */
 public void deleteDataDirectories() throws ConfigurationException {
   for (String submetric : getSubmetrics(metricName)) {
     File metricDir = FileUtils.getFile(srDir, submetric, language.getLangCode());
     if (!metricDir.exists()) {
       continue;
     }
     LOG.info("deleting metric directory " + metricDir);
     // Best-effort removal: deleteQuietly does not throw if the delete fails.
     FileUtils.deleteQuietly(metricDir);
   }
 }
Пример #3
0
  /**
   * Writes the concept file for the current language when at least one submetric of
   * {@code metricName} has type {@code sparsevector.esa} or
   * {@code sparsevector.mostsimilarconcepts} and the file has not been built yet.
   *
   * <p>The file is {@code <sr.concepts.path>/<language code>.txt}. It counts as already built
   * when it is a regular file with more than one line.
   *
   * @throws IOException if the concept file cannot be read, or propagated from the generator
   * @throws ConfigurationException propagated from configuration lookups
   * @throws DaoException propagated from the calls made while generating concepts
   */
  private void buildConceptsIfNecessary() throws IOException, ConfigurationException, DaoException {
    // Every submetric is inspected (no early exit), matching the original lookup sequence.
    boolean conceptsRequired = false;
    for (String submetric : getSubmetrics(metricName)) {
      String metricType = getMetricType(submetric);
      conceptsRequired |=
          metricType.equals("sparsevector.esa")
              || metricType.equals("sparsevector.mostsimilarconcepts");
    }
    if (!conceptsRequired) {
      return;
    }

    String conceptDir = env.getConfiguration().get().getString("sr.concepts.path");
    File conceptFile = FileUtils.getFile(conceptDir, language.getLangCode() + ".txt");
    conceptFile.getParentFile().mkdirs();

    // A file with more than one line is treated as a finished concept file.
    boolean alreadyBuilt = conceptFile.isFile() && FileUtils.readLines(conceptFile).size() > 1;
    if (alreadyBuilt) {
      return;
    }

    LOG.info("building concept file " + conceptFile.getAbsolutePath() + " for " + metricName);
    SRConceptSpaceGenerator generator =
        new SRConceptSpaceGenerator(
            language,
            env.getConfigurator().get(LocalLinkDao.class),
            env.getConfigurator().get(LocalPageDao.class));
    generator.writeConcepts(conceptFile);

    String writtenPath = conceptFile.getAbsolutePath();
    int lineCount = FileUtils.readLines(conceptFile).size();
    LOG.info("finished creating concept file " + writtenPath + " with " + lineCount + " lines");
  }
Пример #4
0
 /**
  * Tells whether a language code should be kept.
  *
  * @param langCode language code to check
  * @return {@code true} only if {@code Language.hasLangCode} accepts the code and {@code langs}
  *     contains it
  */
 private boolean validLanguage(String langCode) {
   // Check the global registry first; langs is consulted only for codes it accepts.
   if (!Language.hasLangCode(langCode)) {
     return false;
   }
   return langs.containsLanguage(langCode);
 }
Пример #5
0
  /**
   * Command-line entry point: parses the options, configures an {@code SRBuilder} and runs
   * {@code build()}.
   *
   * <p>Options declared here: {@code -r} max results, {@code -g} gold standard datasets,
   * {@code -d} delete existing data, {@code -m} metric, {@code -p}/{@code -q} row/column id files
   * (both also enable cosimilarity), {@code -s} cosimilarity, {@code -o} mode, {@code -y} file of
   * valid most-similar ids, {@code -k} skip already built submetrics, {@code -f} fake gold
   * standard. Further options are registered by {@code EnvBuilder.addStandardOptions}.
   *
   * <p>On an option parsing error, the message and usage help are printed and the method returns
   * without building anything.
   *
   * @param args raw command-line arguments
   * @throws ConfigurationException propagated from environment or builder set-up
   * @throws IOException propagated from the builder
   * @throws WikiBrainException propagated from the builder
   * @throws DaoException propagated from the builder
   */
  public static void main(String[] args)
      throws ConfigurationException, IOException, WikiBrainException, DaoException {
    Options options = new Options();

    // Number of max results (otherwise taken from config)
    options.addOption(
        new DefaultOptionBuilder()
            .hasArg()
            .withLongOpt("max-results")
            .withDescription("maximum number of results")
            .create("r"));

    // Gold standard datasets (one or more values)
    options.addOption(
        new DefaultOptionBuilder()
            .hasArgs()
            .withLongOpt("gold")
            .withDescription("the set of gold standard datasets to train on")
            .create("g"));

    // Delete existing data models
    options.addOption(
        new DefaultOptionBuilder()
            .hasArg()
            .withLongOpt("delete")
            .withDescription(
                "delete all existing SR data for the metric and its submetrics (true or false, default is true)")
            .create("d"));

    // Metric to build
    options.addOption(
        new DefaultOptionBuilder()
            .hasArg()
            .withLongOpt("metric")
            .withDescription("set a local metric")
            .create("m"));

    // Row and column ids for most similar caches
    options.addOption(
        new DefaultOptionBuilder()
            .hasArg()
            .withLongOpt("rowids")
            .withDescription("page ids for rows of cosimilarity matrices (implies -s)")
            .create("p"));
    options.addOption(
        new DefaultOptionBuilder()
            .hasArg()
            .withLongOpt("colids")
            .withDescription("page ids for columns of cosimilarity matrices (implies -s)")
            .create("q"));

    // Build the cosimilarity matrix
    options.addOption(
        new DefaultOptionBuilder()
            .withLongOpt("cosimilarity")
            .withDescription("build cosimilarity matrices")
            .create("s"));

    // Build mode
    options.addOption(
        new DefaultOptionBuilder()
            .withLongOpt("mode")
            .hasArg()
            .withDescription("mode: similarity, mostsimilar, or both")
            .create("o"));

    // File of valid most similar ids. The value is read below as a file name, so the
    // option must be declared with an argument; without hasArg() the value is always null.
    options.addOption(
        new DefaultOptionBuilder()
            .hasArg()
            .withLongOpt("validMostSimilarIds")
            .withDescription("Set valid most similar ids")
            .create("y"));

    // When building pairwise cosine and ensembles, don't rebuild already built sub-metrics.
    options.addOption(
        new DefaultOptionBuilder()
            .withLongOpt("skip-built")
            .withDescription("Don't rebuild already built bmetrics (implies -d false)")
            .create("k"));

    // Create a fake gold standard for the language.
    options.addOption(
        new DefaultOptionBuilder()
            .withLongOpt("fake")
            .withDescription("Create a fake gold standard for the language.")
            .create("f"));

    EnvBuilder.addStandardOptions(options);

    CommandLineParser parser = new PosixParser();
    CommandLine cmd;
    try {
      cmd = parser.parse(options, args);
    } catch (ParseException e) {
      System.err.println("Invalid option usage: " + e.getMessage());
      new HelpFormatter().printHelp("SRBuilder", options);
      return;
    }

    Env env = new EnvBuilder(cmd).build();
    String metric = cmd.hasOption("m") ? cmd.getOptionValue("m") : null;
    SRBuilder builder = new SRBuilder(env, metric);
    if (cmd.hasOption("g")) {
      builder.setDatasetNames(Arrays.asList(cmd.getOptionValues("g")));
    }
    if (cmd.hasOption("p")) {
      builder.setRowIdsFromFile(cmd.getOptionValue("p"));
      builder.setBuildCosimilarity(true);
    }
    if (cmd.hasOption("q")) {
      builder.setColIdsFromFile(cmd.getOptionValue("q"));
      builder.setBuildCosimilarity(true);
    }
    if (cmd.hasOption("y")) {
      builder.setValidMostSimilarIdsFromFile(cmd.getOptionValue("y"));
    }
    if (cmd.hasOption("s")) {
      builder.setBuildCosimilarity(true);
    }
    if (cmd.hasOption("k")) {
      builder.setSkipBuiltMetrics(true);
      builder.setDeleteExistingData(false);
    }
    // Handled after -k on purpose: an explicit -d value overrides the "-d false" implied by -k.
    if (cmd.hasOption("d")) {
      builder.setDeleteExistingData(Boolean.valueOf(cmd.getOptionValue("d")));
    }
    if (cmd.hasOption("o")) {
      // Locale.ROOT keeps the upper-casing independent of the default locale
      // (e.g. Turkish would map 'i' to a dotted capital and break valueOf).
      builder.setMode(Mode.valueOf(cmd.getOptionValue("o").toUpperCase(java.util.Locale.ROOT)));
    }
    // "l" is not declared above; presumably registered by EnvBuilder.addStandardOptions.
    if (cmd.hasOption("l")) {
      builder.setLanguage(Language.getByLangCode(cmd.getOptionValue("l")));
    }
    if (cmd.hasOption("r")) {
      builder.setMaxResults(Integer.valueOf(cmd.getOptionValue("r")));
    }
    if (cmd.hasOption("f")) {
      builder.setCreateFakeGoldStandard(true);
    }

    builder.build();
  }
Пример #6
0
  /**
   * Makes sure the word2vec model file of the named metric exists for the current language.
   *
   * <p>Steps, in order:
   *
   * <ol>
   *   <li>If {@code skipBuiltMetrics} is set and the model file exists, nothing is done.
   *   <li>If the generator config has {@code prebuilt = true}, the model is copied (or
   *       gunzipped, when the source name ends in "gz") from the configured {@code binfile}.
   *   <li>Otherwise the link probability DAO is built if needed, the configured corpus is created
   *       if needed, and a model is trained unless an existing model file is newer than the
   *       corpus file (or no corpus is configured).
   * </ol>
   *
   * @param name name of the metric whose {@code generator} configuration is used
   * @throws ConfigurationException if a prebuilt model is missing or does not list the language,
   *     or if no corpus is configured and no model file exists
   * @throws IOException if copying or decompressing the model fails
   * @throws DaoException propagated from the DAO and training calls
   */
  private void initWord2Vec(String name) throws ConfigurationException, IOException, DaoException {
    Config config = getMetricConfig(name).getConfig("generator");
    File model = Word2VecGenerator.getModelFile(config.getString("modelDir"), language);
    if (skipBuiltMetrics && model.isFile()) {
      return;
    }

    if (config.hasPath("prebuilt") && config.getBoolean("prebuilt")) {
      if (model.isFile()) {
        return;
      }
      File downloadPath = new File(config.getString("binfile"));
      if (!downloadPath.isFile()) {
        throw new ConfigurationException(
            "word2vec model "
                + downloadPath.getAbsolutePath()
                + " cannot be found."
                + " You must download it from "
                + config.getString("url")
                + " into to the wikibrain download directory.");
      }
      if (!config.getStringList("languages").contains(language.getLangCode())) {
        throw new ConfigurationException(
            "word2vec model " + downloadPath + " does not support language " + language);
      }
      if (downloadPath.toString().toLowerCase().endsWith("gz")) {
        LOG.info("decompressing " + downloadPath + " to " + model);
        File tmp = File.createTempFile("word2vec", "bin");
        try {
          FileUtils.deleteQuietly(tmp);
          FileInputStream raw = new FileInputStream(downloadPath);
          try {
            GZIPInputStream gz = new GZIPInputStream(raw);
            FileUtils.copyInputStreamToFile(gz, tmp);
            gz.close();
          } finally {
            // Releases the file handle even when the gzip header is invalid or the copy fails.
            raw.close();
          }
          model.getParentFile().mkdirs();
          FileUtils.moveFile(tmp, model);
        } finally {
          // Removes the temp file if it was not moved into place.
          FileUtils.deleteQuietly(tmp);
        }
      } else {
        FileUtils.copyFile(downloadPath, model);
      }
      return;
    }

    LinkProbabilityDao lpd = env.getConfigurator().get(LinkProbabilityDao.class);
    lpd.useCache(true);
    if (!lpd.isBuilt()) {
      lpd.build();
    }

    // The literal name "NONE" means that no corpus is configured.
    String corpusName = config.getString("corpus");
    Corpus corpus = null;
    if (!corpusName.equals("NONE")) {
      corpus =
          env.getConfigurator().get(Corpus.class, corpusName, "language", language.getLangCode());
      if (!corpus.exists()) {
        corpus.create();
      }
    }

    // Reuse an existing model unless the corpus file has been modified after it.
    if (model.isFile()
        && (corpus == null || model.lastModified() > corpus.getCorpusFile().lastModified())) {
      return;
    }
    if (corpus == null) {
      throw new ConfigurationException(
          "word2vec metric "
              + name
              + " cannot build or find model! "
              + "configuration has no corpus, but model not found at "
              + model
              + ".");
    }
    Word2VecTrainer trainer =
        new Word2VecTrainer(env.getConfigurator().get(LocalPageDao.class), language);
    if (config.hasPath("dimensions")) {
      int dimensions = config.getInt("dimensions");
      LOG.info("set number of dimensions to " + dimensions);
      trainer.setLayer1Size(dimensions);
    }
    if (config.hasPath("maxWords")) {
      int maxWords = config.getInt("maxWords");
      LOG.info("set maxWords to " + maxWords);
      trainer.setMaxWords(maxWords);
    }
    if (config.hasPath("window")) {
      // Log the window value itself; reading "maxWords" here fails when only "window" is set.
      int window = config.getInt("window");
      LOG.info("set window to " + window);
      trainer.setWindow(window);
    }
    trainer.setKeepAllArticles(true);
    trainer.train(corpus.getDirectory());
    trainer.save(model);
  }
Пример #7
0
 /**
  * Looks up the named metric for the current language through the environment's configurator.
  *
  * @param name name of the metric to fetch
  * @return the metric returned by the configurator
  * @throws ConfigurationException propagated from the configurator lookup
  */
 public synchronized SRMetric getMetric(String name) throws ConfigurationException {
   SRMetric metric =
       env.getConfigurator().get(SRMetric.class, name, "language", language.getLangCode());
   return metric;
 }