public WikidataEntity parse(String json) throws WpParseException { JacksonTermedStatementDocument mwDoc; try { mwDoc = mapper.readValue(json, JacksonTermedStatementDocument.class); } catch (IOException e) { LOG.info("Error parsing: " + json); throw new WpParseException(e); } WikidataEntity record = new WikidataEntity(mwDoc.getEntityId().getId()); // Aliases (multiple per language) for (List<MonolingualTextValue> vlist : mwDoc.getAliases().values()) { if (vlist.isEmpty()) continue; if (!validLanguage(vlist.get(0).getLanguageCode())) continue; Language lang = Language.getByLangCodeLenient(vlist.get(0).getLanguageCode()); record.getAliases().put(lang, new ArrayList<String>()); for (MonolingualTextValue v : vlist) { record.getAliases().get(lang).add(v.getText()); } } // Descriptions (one per language) for (MonolingualTextValue v : mwDoc.getDescriptions().values()) { if (validLanguage(v.getLanguageCode())) { Language lang = Language.getByLangCodeLenient(v.getLanguageCode()); record.getDescriptions().put(lang, v.getText()); } } // Labels (one per language) for (MonolingualTextValue v : mwDoc.getLabels().values()) { if (validLanguage(v.getLanguageCode())) { Language lang = Language.getByLangCodeLenient(v.getLanguageCode()); record.getLabels().put(lang, v.getText()); } } // Claims (only for Item entities) if (mwDoc instanceof JacksonItemDocument) { for (List<JacksonStatement> statements : ((JacksonItemDocument) mwDoc).getJsonClaims().values()) { for (JacksonStatement s : statements) { record.getStatements().add(parseOneClaim(record, s)); } } } return record; }
/**
 * Deletes the data directory of every submetric of the configured metric for the current
 * language.
 *
 * <p>Only metric names are used here; the metric itself is deliberately not loaded, because a
 * loaded metric has already accessed its data files.
 *
 * @throws ConfigurationException propagated from {@code getSubmetrics}
 */
public void deleteDataDirectories() throws ConfigurationException {
    for (String submetric : getSubmetrics(metricName)) {
        File dataDir = FileUtils.getFile(srDir, submetric, language.getLangCode());
        if (!dataDir.exists()) {
            continue;
        }
        LOG.info("deleting metric directory " + dataDir);
        FileUtils.deleteQuietly(dataDir);
    }
}
private void buildConceptsIfNecessary() throws IOException, ConfigurationException, DaoException { boolean needsConcepts = false; for (String name : getSubmetrics(metricName)) { String type = getMetricType(name); if (type.equals("sparsevector.esa") || type.equals("sparsevector.mostsimilarconcepts")) { needsConcepts = true; } } if (!needsConcepts) { return; } File path = FileUtils.getFile( env.getConfiguration().get().getString("sr.concepts.path"), language.getLangCode() + ".txt"); path.getParentFile().mkdirs(); // Check to see if concepts are already built if (path.isFile() && FileUtils.readLines(path).size() > 1) { return; } LOG.info("building concept file " + path.getAbsolutePath() + " for " + metricName); SRConceptSpaceGenerator gen = new SRConceptSpaceGenerator( language, env.getConfigurator().get(LocalLinkDao.class), env.getConfigurator().get(LocalPageDao.class)); gen.writeConcepts(path); LOG.info( "finished creating concept file " + path.getAbsolutePath() + " with " + FileUtils.readLines(path).size() + " lines"); }
/**
 * Tells whether a language code should be kept.
 *
 * @param langCode language code to check
 * @return {@code true} only when {@code Language} knows the code and {@code langs} contains it
 */
private boolean validLanguage(String langCode) {
    if (!Language.hasLangCode(langCode)) {
        return false;
    }
    return langs.containsLanguage(langCode);
}
public static void main(String args[]) throws ConfigurationException, IOException, WikiBrainException, DaoException { Options options = new Options(); // Number of Max Results(otherwise take from config) options.addOption( new DefaultOptionBuilder() .hasArg() .withLongOpt("max-results") .withDescription("maximum number of results") .create("r")); // Specify the Datasets options.addOption( new DefaultOptionBuilder() .hasArgs() .withLongOpt("gold") .withDescription("the set of gold standard datasets to train on") .create("g")); // Delete existing data models options.addOption( new DefaultOptionBuilder() .hasArg() .withLongOpt("delete") .withDescription( "delete all existing SR data for the metric and its submetrics (true or false, default is true)") .create("d")); // Specify the Metrics options.addOption( new DefaultOptionBuilder() .hasArg() .withLongOpt("metric") .withDescription("set a local metric") .create("m")); // Row and column ids for most similar caches options.addOption( new DefaultOptionBuilder() .hasArg() .withLongOpt("rowids") .withDescription("page ids for rows of cosimilarity matrices (implies -s)") .create("p")); options.addOption( new DefaultOptionBuilder() .hasArg() .withLongOpt("colids") .withDescription("page ids for columns of cosimilarity matrices (implies -s)") .create("q")); // build the cosimilarity matrix options.addOption( new DefaultOptionBuilder() .withLongOpt("cosimilarity") .withDescription("build cosimilarity matrices") .create("s")); // sets the mode options.addOption( new DefaultOptionBuilder() .withLongOpt("mode") .hasArg() .withDescription("mode: similarity, mostsimilar, or both") .create("o")); // add option for valid most similar ids options.addOption( new DefaultOptionBuilder() .withLongOpt("validMostSimilarIds") .withDescription("Set valid most similar ids") .create("y")); // when building pairwise cosine and ensembles, don't rebuild already built sub-metrics. 
options.addOption( new DefaultOptionBuilder() .withLongOpt("skip-built") .withDescription("Don't rebuild already built bmetrics (implies -d false)") .create("k")); // when building pairwise cosine and ensembles, don't rebuild already built sub-metrics. options.addOption( new DefaultOptionBuilder() .withLongOpt("fake") .withDescription("Create a fake gold standard for the language.") .create("f")); EnvBuilder.addStandardOptions(options); CommandLineParser parser = new PosixParser(); CommandLine cmd; try { cmd = parser.parse(options, args); } catch (ParseException e) { System.err.println("Invalid option usage: " + e.getMessage()); new HelpFormatter().printHelp("SRBuilder", options); return; } Env env = new EnvBuilder(cmd).build(); String metric = cmd.hasOption("m") ? cmd.getOptionValue("m") : null; SRBuilder builder = new SRBuilder(env, metric); if (cmd.hasOption("g")) { builder.setDatasetNames(Arrays.asList(cmd.getOptionValues("g"))); } if (cmd.hasOption("p")) { builder.setRowIdsFromFile(cmd.getOptionValue("p")); builder.setBuildCosimilarity(true); } if (cmd.hasOption("q")) { builder.setColIdsFromFile(cmd.getOptionValue("q")); builder.setBuildCosimilarity(true); } if (cmd.hasOption("y")) { builder.setValidMostSimilarIdsFromFile(cmd.getOptionValue("y")); } if (cmd.hasOption("s")) { builder.setBuildCosimilarity(true); } if (cmd.hasOption("k")) { builder.setSkipBuiltMetrics(true); builder.setDeleteExistingData(false); } if (cmd.hasOption("d")) { builder.setDeleteExistingData(Boolean.valueOf(cmd.getOptionValue("d"))); } if (cmd.hasOption("o")) { builder.setMode(Mode.valueOf(cmd.getOptionValue("o").toUpperCase())); } if (cmd.hasOption("l")) { builder.setLanguage(Language.getByLangCode(cmd.getOptionValue("l"))); } if (cmd.hasOption("r")) { builder.setMaxResults(Integer.valueOf(cmd.getOptionValue("r"))); } if (cmd.hasOption("f")) { builder.setCreateFakeGoldStandard(true); } builder.build(); }
private void initWord2Vec(String name) throws ConfigurationException, IOException, DaoException { Config config = getMetricConfig(name).getConfig("generator"); File model = Word2VecGenerator.getModelFile(config.getString("modelDir"), language); if (skipBuiltMetrics && model.isFile()) { return; } if (config.hasPath("prebuilt") && config.getBoolean("prebuilt")) { if (model.isFile()) { return; } File downloadPath = new File(config.getString("binfile")); if (!downloadPath.isFile()) { throw new ConfigurationException( "word2vec model " + downloadPath.getAbsolutePath() + " cannot be found." + " You must download it from " + config.getString("url") + " into to the wikibrain download directory."); } if (!config.getStringList("languages").contains(language.getLangCode())) { throw new ConfigurationException( "word2vec model " + downloadPath + " does not support language" + language); } if (downloadPath.toString().toLowerCase().endsWith("gz")) { LOG.info("decompressing " + downloadPath + " to " + model); File tmp = File.createTempFile("word2vec", "bin"); try { FileUtils.deleteQuietly(tmp); GZIPInputStream gz = new GZIPInputStream(new FileInputStream(downloadPath)); FileUtils.copyInputStreamToFile(gz, tmp); gz.close(); model.getParentFile().mkdirs(); FileUtils.moveFile(tmp, model); } finally { FileUtils.deleteQuietly(tmp); } } else { FileUtils.copyFile(downloadPath, model); } return; } LinkProbabilityDao lpd = env.getConfigurator().get(LinkProbabilityDao.class); lpd.useCache(true); if (!lpd.isBuilt()) { lpd.build(); } String corpusName = config.getString("corpus"); Corpus corpus = null; if (!corpusName.equals("NONE")) { corpus = env.getConfigurator() .get(Corpus.class, config.getString("corpus"), "language", language.getLangCode()); if (!corpus.exists()) { corpus.create(); } } if (model.isFile() && (corpus == null || model.lastModified() > corpus.getCorpusFile().lastModified())) { return; } if (corpus == null) { throw new ConfigurationException( "word2vec metric " + name + " 
cannot build or find model!" + "configuration has no corpus, but model not found at " + model + "."); } Word2VecTrainer trainer = new Word2VecTrainer(env.getConfigurator().get(LocalPageDao.class), language); if (config.hasPath("dimensions")) { LOG.info("set number of dimensions to " + config.getInt("dimensions")); trainer.setLayer1Size(config.getInt("dimensions")); } if (config.hasPath("maxWords")) { LOG.info("set maxWords to " + config.getInt("maxWords")); trainer.setMaxWords(config.getInt("maxWords")); } if (config.hasPath("window")) { LOG.info("set window to " + config.getInt("maxWords")); trainer.setWindow(config.getInt("window")); } trainer.setKeepAllArticles(true); trainer.train(corpus.getDirectory()); trainer.save(model); }
/**
 * Looks up the named {@link SRMetric} for the current language through the environment's
 * configurator. The method is synchronized on this instance.
 *
 * @param name name of the metric to fetch
 * @return the metric returned by the configurator
 * @throws ConfigurationException propagated from the configurator lookup
 */
public synchronized SRMetric getMetric(String name) throws ConfigurationException {
    String langCode = language.getLangCode();
    return env.getConfigurator().get(SRMetric.class, name, "language", langCode);
}