public DateTimeDatasetVersionFinder(FileSystem fs, Config config) {
  super(fs);
  Preconditions.checkArgument(
      config.hasPath(DATE_TIME_PATTERN_KEY),
      "Missing required property " + DATE_TIME_PATTERN_KEY);
  String pattern = config.getString(DATE_TIME_PATTERN_KEY);
  if (config.hasPath(OPTIONAL_GLOB_PATTERN_TIMEZONE_KEY)) {
    this.globPattern = new Path(config.getString(OPTIONAL_GLOB_PATTERN_TIMEZONE_KEY));
  } else {
    // Derive a glob by replacing every non-slash path segment of the date pattern with "*".
    this.globPattern = new Path(pattern.replaceAll("[^/]+", "*"));
  }
  LOGGER.debug(
      String.format(
          "Setting timezone for pattern: %s. By default it is %s",
          pattern, DEFAULT_DATE_TIME_PATTERN_TIMEZONE));
  if (config.hasPath(DATE_TIME_PATTERN_TIMEZONE_KEY)) {
    this.formatter =
        DateTimeFormat.forPattern(pattern)
            .withZone(DateTimeZone.forID(config.getString(DATE_TIME_PATTERN_TIMEZONE_KEY)));
  } else {
    this.formatter =
        DateTimeFormat.forPattern(pattern)
            .withZone(DateTimeZone.forID(DEFAULT_DATE_TIME_PATTERN_TIMEZONE));
  }
  this.datePartitionPattern = pattern;
}
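// A minimal sketch of the glob derivation above, assuming a date-time pattern of
// "yyyy/MM/dd": every non-slash segment collapses to "*", so version directories
// can be listed with a single FileSystem glob.
String pattern = "yyyy/MM/dd";
String glob = pattern.replaceAll("[^/]+", "*"); // yields "*/*/*"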
/**
 * <ul>
 *   <li>The constructor takes in a dataset {@link Config} which MUST have a comma separated list
 *       of destination formats at key, {@value #DESTINATION_CONVERSION_FORMATS_KEY}
 *   <li>Conversion configuration for a format can be set by using the destination format as a
 *       prefix.
 *   <li>E.g. If {@value #DESTINATION_CONVERSION_FORMATS_KEY}=flattenedOrc,nestedOrc.<br>
 *       The destination table name for flattened ORC is set at flattenedOrc.tableName<br>
 *       And the destination table name for nested ORC is set at nestedOrc.tableName
 * </ul>
 *
 * @param fs the {@link FileSystem} the dataset lives on
 * @param clientPool pool of Hive metastore clients
 * @param table the source Hive table
 * @param config the dataset config
 */
public ConvertibleHiveDataset(
    FileSystem fs, HiveMetastoreClientPool clientPool, Table table, Config config) {
  super(fs, clientPool, table, config);
  Preconditions.checkArgument(
      config.hasPath(DESTINATION_CONVERSION_FORMATS_KEY),
      String.format(
          "At least one destination format should be specified at %s.%s. If you do not intend to convert this dataset, set %s.%s to true",
          super.properties.getProperty(HiveDatasetFinder.HIVE_DATASET_CONFIG_PREFIX_KEY, ""),
          DESTINATION_CONVERSION_FORMATS_KEY,
          super.properties.getProperty(HiveDatasetFinder.HIVE_DATASET_CONFIG_PREFIX_KEY, ""),
          HiveDatasetFinder.HIVE_DATASET_IS_BLACKLISTED_KEY));

  // The value for DESTINATION_CONVERSION_FORMATS_KEY can be a TypeSafe list or a comma
  // separated list of strings.
  this.destFormats =
      Sets.newHashSet(ConfigUtils.getStringList(config, DESTINATION_CONVERSION_FORMATS_KEY));

  // For each format, create a ConversionConfig and store it in a Map<format, conversionConfig>.
  this.destConversionConfigs = Maps.newHashMap();
  for (String format : this.destFormats) {
    if (config.hasPath(format)) {
      this.destConversionConfigs.put(
          format, new ConversionConfig(config.getConfig(format), table, format));
    }
  }
}
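// Hedged sketch of the config shape the javadoc above describes. The literal key
// "destinationFormats" is an assumption standing in for the actual value of
// DESTINATION_CONVERSION_FORMATS_KEY; the per-format keys follow the
// flattenedOrc/nestedOrc example in the javadoc.
Config example =
    ConfigFactory.parseString(
        "destinationFormats = \"flattenedOrc,nestedOrc\"\n"
            + "flattenedOrc.tableName = events_flat\n"
            + "nestedOrc.tableName = events_nested");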
private backtype.storm.Config getStormConfig(com.typesafe.config.Config config) {
  backtype.storm.Config conf = new backtype.storm.Config();
  conf.put(RichSpoutBatchExecutor.MAX_BATCH_SIZE_CONF, Int.box(64 * 1024));
  conf.put(backtype.storm.Config.TOPOLOGY_RECEIVER_BUFFER_SIZE, Int.box(8));
  conf.put(backtype.storm.Config.TOPOLOGY_TRANSFER_BUFFER_SIZE, Int.box(32));
  conf.put(backtype.storm.Config.TOPOLOGY_EXECUTOR_RECEIVE_BUFFER_SIZE, Int.box(16384));
  conf.put(backtype.storm.Config.TOPOLOGY_EXECUTOR_SEND_BUFFER_SIZE, Int.box(16384));
  conf.put(backtype.storm.Config.NIMBUS_THRIFT_MAX_BUFFER_SIZE, Int.box(20480000));

  String nimbusHost = STORM_NIMBUS_HOST_DEFAULT;
  if (environment.config().hasPath(STORM_NIMBUS_HOST_CONF_PATH)) {
    nimbusHost = environment.config().getString(STORM_NIMBUS_HOST_CONF_PATH);
    LOG.info("Overriding {} = {}", STORM_NIMBUS_HOST_CONF_PATH, nimbusHost);
  } else {
    LOG.info("Using default {} = {}", STORM_NIMBUS_HOST_CONF_PATH, STORM_NIMBUS_HOST_DEFAULT);
  }
  Integer nimbusThriftPort = STORM_NIMBUS_THRIFT_DEFAULT;
  if (environment.config().hasPath(STORM_NIMBUS_THRIFT_CONF_PATH)) {
    nimbusThriftPort = environment.config().getInt(STORM_NIMBUS_THRIFT_CONF_PATH);
    LOG.info("Overriding {} = {}", STORM_NIMBUS_THRIFT_CONF_PATH, nimbusThriftPort);
  } else {
    LOG.info("Using default {} = {}", STORM_NIMBUS_THRIFT_CONF_PATH, STORM_NIMBUS_THRIFT_DEFAULT);
  }
  conf.put(backtype.storm.Config.NIMBUS_HOST, nimbusHost);
  conf.put(backtype.storm.Config.NIMBUS_THRIFT_PORT, nimbusThriftPort);
  conf.put(
      Config.STORM_THRIFT_TRANSPORT_PLUGIN,
      "backtype.storm.security.auth.SimpleTransportPlugin");
  if (config.hasPath(WORKERS)) {
    conf.setNumWorkers(config.getInt(WORKERS));
  }
  if (config.hasPath(TOPOLOGY_MESSAGE_TIMEOUT_SECS)) {
    conf.put(TOPOLOGY_MESSAGE_TIMEOUT_SECS, config.getInt(TOPOLOGY_MESSAGE_TIMEOUT_SECS));
  }
  return conf;
}
private ConversionConfig(Config config, Table table, String destinationFormat) {
  Preconditions.checkArgument(
      config.hasPath(DESTINATION_TABLE_KEY),
      String.format("Key %s.%s is not specified", destinationFormat, DESTINATION_TABLE_KEY));
  Preconditions.checkArgument(
      config.hasPath(DESTINATION_DB_KEY),
      String.format("Key %s.%s is not specified", destinationFormat, DESTINATION_DB_KEY));
  Preconditions.checkArgument(
      config.hasPath(DESTINATION_DATA_PATH_KEY),
      String.format("Key %s.%s is not specified", destinationFormat, DESTINATION_DATA_PATH_KEY));

  // Required
  this.destinationFormat = destinationFormat;
  this.destinationTableName = resolveTemplate(config.getString(DESTINATION_TABLE_KEY), table);
  this.destinationStagingTableName =
      String.format("%s_%s", this.destinationTableName, "staging"); // Fixed and non-configurable
  this.destinationDbName = resolveTemplate(config.getString(DESTINATION_DB_KEY), table);
  this.destinationDataPath = resolveTemplate(config.getString(DESTINATION_DATA_PATH_KEY), table);

  // Optional
  this.clusterBy = ConfigUtils.getStringList(config, CLUSTER_BY_KEY);
  this.numBuckets = Optional.fromNullable(ConfigUtils.getInt(config, NUM_BUCKETS_KEY, null));
  this.hiveRuntimeProperties =
      ConfigUtils.configToProperties(
          ConfigUtils.getConfig(
              config, HIVE_RUNTIME_PROPERTIES_KEY_PREFIX, ConfigFactory.empty()));
  this.evolutionEnabled = ConfigUtils.getBoolean(config, EVOLUTION_ENABLED, false);
  this.rowLimit = Optional.fromNullable(ConfigUtils.getInt(config, ROW_LIMIT_KEY, null));
  this.sourceDataPathIdentifier =
      ConfigUtils.getStringList(config, SOURCE_DATA_PATH_IDENTIFIER_KEY);
}
public void buildMetric(String name) throws ConfigurationException, DaoException, IOException {
  LOG.info("building component metric " + name);
  String type = getMetricType(name);
  if (type.equals("densevector.word2vec")) {
    initWord2Vec(name);
  }

  SRMetric metric = getMetric(name);
  if (type.equals("ensemble")) {
    ((EnsembleMetric) metric).setTrainSubmetrics(false); // Do it by hand
  } else if (type.equals("sparsevector.mostsimilarconcepts")) {
    if (mode == Mode.SIMILARITY) {
      LOG.warn("metric " + name + " of type " + type + " requires mostSimilar... training BOTH");
      mode = Mode.BOTH;
    }
    throw new UnsupportedOperationException("This block needs to occur earlier.");
  } else if (type.equals("milnewitten")) {
    ((MilneWittenMetric) metric).setTrainSubmetrics(false);
  }

  if (metric instanceof BaseSRMetric) {
    ((BaseSRMetric) metric).setBuildMostSimilarCache(buildCosimilarity);
  }

  Dataset ds = getDataset();
  if (mode == Mode.SIMILARITY || mode == Mode.BOTH) {
    if (skipBuiltMetrics && metric.similarityIsTrained()) {
      LOG.info("metric " + name + " similarity() is already trained... skipping");
    } else {
      metric.trainSimilarity(ds);
    }
  }

  if (mode == Mode.MOSTSIMILAR || mode == Mode.BOTH) {
    if (skipBuiltMetrics && metric.mostSimilarIsTrained()) {
      LOG.info("metric " + name + " mostSimilar() is already trained... skipping");
    } else {
      Config config = getMetricConfig(name);
      int n = maxResults * EnsembleMetric.SEARCH_MULTIPLIER;
      TIntSet validIds = validMostSimilarIds;
      if (config.hasPath("maxResults")) {
        n = config.getInt("maxResults");
      }
      if (config.hasPath("mostSimilarConcepts")) {
        String path =
            String.format(
                "%s/%s.txt",
                config.getString("mostSimilarConcepts"), metric.getLanguage().getLangCode());
        validIds = readIds(path);
      }
      metric.trainMostSimilar(ds, n, validIds);
    }
  }
  metric.write();
}
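// Hedged sketch of the optional per-metric keys read in the mostSimilar branch
// above. Per the String.format there, "mostSimilarConcepts" names a directory
// holding one <langCode>.txt id file per language; the paths are illustrative.
Config metricCfg =
    ConfigFactory.parseString(
        "maxResults = 500\n" + "mostSimilarConcepts = dat/mostSimilarConcepts");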
public static void main(String[] args) {
  Config conf = ConfigFactory.load("test");
  if (conf.hasPath("greed")) {
    System.out.println(conf.getConfig("greed").toString());
  }
  System.out.println(conf.getConfig("greed.lang.cpp").resolve().toString());
  conf =
      ConfigFactory.parseFile(
              new File(System.getProperty("user.dir") + "/src/main/resources/default.conf"))
          .resolve();
  if (conf.hasPath("greed")) {
    System.out.println(conf.getConfig("greed").toString());
  }
}
public Sample(
    CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) {
  super(builder, config, parent, child, context);
  this.probability = getConfigs().getDouble(config, "probability", 1.0);
  if (probability < 0.0) {
    throw new MorphlineCompilationException(
        "Probability must not be negative: " + probability, config);
  }
  if (probability >= 1.0) {
    this.prng = null;
  } else {
    if (config.hasPath("seed")) {
      long seed = getConfigs().getLong(config, "seed");
      this.prng = new Well19937c(seed); // non-secure & fast
    } else {
      Random rand = new SecureRandom();
      int[] seed = new int[624];
      for (int i = 0; i < seed.length; i++) {
        seed[i] = rand.nextInt();
      }
      this.prng = new Well19937c(seed); // non-secure & fast
    }
  }
  validateArguments();
}
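// Hedged usage sketch for the command above: with "seed" present the
// Well19937c stream is fixed, so a 10% sample is reproducible across runs;
// without it the generator is seeded from SecureRandom. Both key names come
// straight from the constructor.
Config sampleConfig = ConfigFactory.parseString("probability = 0.1\n" + "seed = 42");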
/**
 * Returns a list of metric names (including the passed in name) that are a submetric of the
 * specified metric. The metrics are topologically sorted by dependency, so the parent metric will
 * appear last.
 *
 * @param parentName name of the metric whose submetrics are wanted
 * @return submetric names, each appearing once, with the parent metric last
 * @throws ConfigurationException if the metric configuration cannot be read
 */
public List<String> getSubmetrics(String parentName) throws ConfigurationException {
  String type = getMetricType(parentName);
  Config config = getMetricConfig(parentName);
  List<String> toAdd = new ArrayList<String>();
  if (type.equals("ensemble") || type.equals("simple-ensemble")) {
    for (String child : config.getStringList("metrics")) {
      toAdd.addAll(getSubmetrics(child));
      toAdd.add(child);
    }
  } else if (type.equals("sparsevector.mostsimilarconcepts")) {
    toAdd.addAll(getSubmetrics(config.getString("generator.basemetric")));
  } else if (type.equals("milnewitten")) {
    toAdd.add(config.getString("inlink"));
    toAdd.add(config.getString("outlink"));
  } else if (config.hasPath("reliesOn")) {
    toAdd.addAll(config.getStringList("reliesOn"));
  }
  toAdd.add(parentName);

  // Make sure things only appear once. We save the FIRST time they appear to preserve
  // dependencies.
  List<String> results = new ArrayList<String>();
  for (String name : toAdd) {
    if (!results.contains(name)) {
      results.add(name);
    }
  }
  return results;
}
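// Worked example of the dedup step above: duplicates keep their first position,
// so dependencies stay ahead of dependents and the parent metric stays last.
List<String> toAdd = Arrays.asList("a", "a", "b", "ensemble");
List<String> results = new ArrayList<>();
for (String name : toAdd) {
  if (!results.contains(name)) {
    results.add(name);
  }
}
// results == [a, b, ensemble]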
public GlobModTimeDatasetVersionFinder(FileSystem fs, Config config) {
  this(
      fs,
      config.hasPath(VERSION_FINDER_GLOB_PATTERN_KEY)
          ? new Path(config.getString(VERSION_FINDER_GLOB_PATTERN_KEY))
          : new Path("*"));
}
public YarnService(
    Config config,
    String applicationName,
    String applicationId,
    YarnConfiguration yarnConfiguration,
    FileSystem fs,
    EventBus eventBus)
    throws Exception {
  this.applicationName = applicationName;
  this.applicationId = applicationId;
  this.config = config;
  this.eventBus = eventBus;

  this.gobblinMetrics =
      config.getBoolean(ConfigurationKeys.METRICS_ENABLED_KEY)
          ? Optional.of(buildGobblinMetrics())
          : Optional.<GobblinMetrics>absent();
  this.eventSubmitter =
      config.getBoolean(ConfigurationKeys.METRICS_ENABLED_KEY)
          ? Optional.of(buildEventSubmitter())
          : Optional.<EventSubmitter>absent();

  this.yarnConfiguration = yarnConfiguration;
  this.fs = fs;

  this.amrmClientAsync =
      closer.register(
          AMRMClientAsync.createAMRMClientAsync(1000, new AMRMClientCallbackHandler()));
  this.amrmClientAsync.init(this.yarnConfiguration);
  this.nmClientAsync =
      closer.register(NMClientAsync.createNMClientAsync(new NMClientCallbackHandler()));
  this.nmClientAsync.init(this.yarnConfiguration);

  this.initialContainers = config.getInt(GobblinYarnConfigurationKeys.INITIAL_CONTAINERS_KEY);
  this.requestedContainerMemoryMbs =
      config.getInt(GobblinYarnConfigurationKeys.CONTAINER_MEMORY_MBS_KEY);
  this.requestedContainerCores = config.getInt(GobblinYarnConfigurationKeys.CONTAINER_CORES_KEY);
  this.containerHostAffinityEnabled =
      config.getBoolean(GobblinYarnConfigurationKeys.CONTAINER_HOST_AFFINITY_ENABLED);
  this.helixInstanceMaxRetries =
      config.getInt(GobblinYarnConfigurationKeys.HELIX_INSTANCE_MAX_RETRIES);
  this.containerJvmArgs =
      config.hasPath(GobblinYarnConfigurationKeys.CONTAINER_JVM_ARGS_KEY)
          ? Optional.of(config.getString(GobblinYarnConfigurationKeys.CONTAINER_JVM_ARGS_KEY))
          : Optional.<String>absent();

  this.containerLaunchExecutor =
      Executors.newFixedThreadPool(
          10,
          ExecutorsUtils.newThreadFactory(
              Optional.of(LOGGER), Optional.of("ContainerLaunchExecutor")));

  this.tokens = getSecurityTokens();
}
public int getCpuSetSize() {
  if (vampires.hasPath("cpuSetSize")) {
    return vampires.getInt("cpuSetSize");
  }
  // Fall back to a single CPU when the key is missing.
  LOG.error("missing executor cpuSetSize");
  return 1;
}
public CandidateFilterFactory() {
  Config config = ConfigUtils.getDefaultConfig();
  lshSampleRatio = config.getDouble("model.lsh.sample-ratio");
  numHashes = config.getInt("model.lsh.num-hashes");
  candidateFilterClassName =
      config.hasPath("serving-layer.candidate-filter-class")
          ? config.getString("serving-layer.candidate-filter-class")
          : null;
}
@Override
public void configure(Config config, String key) {
  fieldName1 = config.getString(key + ".field1");
  fieldName2 = config.getString(key + ".field2");
  if (config.hasPath(key + ".keys")) {
    keys = config.getStringList(key + ".keys");
  }
  key2 = config.getString(key + ".key2");
  constant = config.getDouble(key + ".constant");
  outputName = config.getString(key + ".output");
}
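// Hedged example of the flattened layout configure() reads, assuming
// key = "myTransform"; field values are illustrative only, and the ".keys"
// list is the one optional entry.
Config c =
    ConfigFactory.parseString(
        "myTransform.field1 = a\n"
            + "myTransform.field2 = b\n"
            + "myTransform.key2 = k\n"
            + "myTransform.constant = 2.0\n"
            + "myTransform.output = out");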
public List<String> getExecutors() {
  if (vampires.hasPath("executors")) {
    return vampires.getStringList("executors").stream()
        .map(String::toUpperCase)
        .collect(Collectors.toList());
  }
  LOG.error("missing executors config value");
  throw new IllegalArgumentException("missing executors config value");
}
@Test
public void testPropertiesToConfigWithPrefix() {
  Properties properties = new Properties();
  properties.setProperty("k1.kk1", "v1");
  properties.setProperty("k1.kk2", "v2");
  properties.setProperty("k2.kk", "v3");

  Config conf = ConfigUtils.propertiesToConfig(properties, Optional.of("k1"));
  Assert.assertEquals(conf.getString("k1.kk1"), "v1");
  Assert.assertEquals(conf.getString("k1.kk2"), "v2");
  Assert.assertFalse(conf.hasPath("k2.kk"), "Should not contain key k2.kk");
}
private void cacheManagerPeerListenerFactory(final Config conf) {
  if (conf.hasPath("class")) {
    eh.addCacheManagerPeerListenerFactory(
        newFactory("ehcache.cacheManagerPeerListenerFactory", conf, FactoryConfiguration::new));
  } else {
    each(
        conf,
        (name, c) -> {
          eh.addCacheManagerPeerListenerFactory(
              newFactory(
                  "ehcache.cacheManagerPeerListenerFactory." + name,
                  c,
                  FactoryConfiguration::new));
        });
  }
}
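// The method above accepts two shapes: a single factory config carrying "class"
// directly, or a map of named sub-configs. A hedged sketch of both; the class
// value is illustrative, not a recommendation.
Config single =
    ConfigFactory.parseString(
        "class = net.sf.ehcache.distribution.RMICacheManagerPeerListenerFactory");
Config named =
    ConfigFactory.parseString(
        "rmi.class = net.sf.ehcache.distribution.RMICacheManagerPeerListenerFactory");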
@Override
public void configure(final Env env, final Config config, final Binder binder) {
  if (config.hasPath("server.module")) {
    try {
      delegate =
          (Jooby.Module)
              getClass()
                  .getClassLoader()
                  .loadClass(config.getString("server.module"))
                  .newInstance();
      delegate.configure(env, config, binder);
    } catch (Exception ex) {
      throw new IllegalStateException(
          "No " + Server.class.getName() + " implementation was found.", ex);
    }
  }
}
@Nonnull
private Order<Rating> getRatingOrder(Config cfg) throws SpecificationException {
  Order<Rating> order = new RandomOrder<>();
  if (cfg.hasPath("order")) {
    switch (cfg.getString("order").toLowerCase()) {
      case "random":
        order = new RandomOrder<>();
        break;
      case "timestamp":
        order = new TimestampOrder<>();
        break;
      default:
        throw new SpecificationException(
            "invalid order " + cfg.getString("order") + " for crossfold");
    }
  }
  return order;
}
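// Minimal sketch: the "order" key is optional and matched case-insensitively
// (note the toLowerCase above); anything other than "random" or "timestamp"
// is rejected with a SpecificationException.
Order<Rating> order = getRatingOrder(ConfigFactory.parseString("order = TIMESTAMP")); // TimestampOrder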
private static Map<Object, Object> config(
    final Env env, final Config config, final List<Class<?>> classes) {
  Map<Object, Object> $ = new HashMap<>();
  config
      .getConfig("hibernate")
      .entrySet()
      .forEach(e -> $.put("hibernate." + e.getKey(), e.getValue().unwrapped()));
  if (classes.size() > 0) {
    $.put(AvailableSettings.LOADED_CLASSES, classes);
  }
  if (!config.hasPath("hibernate.hbm2ddl.auto")) {
    // Default to "update" in dev mode and "validate" everywhere else.
    String hbm2ddl = env.name().equals("dev") ? "update" : "validate";
    $.put("hibernate.hbm2ddl.auto", hbm2ddl);
  }
  return $;
}
@Override
public void start(
    Application<StormEnvironment, StormTopology> executor, com.typesafe.config.Config config) {
  String topologyName = config.getString("appId");
  Preconditions.checkNotNull(
      topologyName,
      "[appId] is required but was null for " + executor.getClass().getCanonicalName());
  StormTopology topology = executor.execute(config, environment);
  LOG.info(
      "Starting {} ({}), mode: {}",
      topologyName,
      executor.getClass().getCanonicalName(),
      config.getString("mode"));
  Config conf = getStormConfig(config);
  if (ApplicationEntity.Mode.CLUSTER.name().equalsIgnoreCase(config.getString("mode"))) {
    String jarFile = config.hasPath("jarPath") ? config.getString("jarPath") : null;
    if (jarFile == null) {
      jarFile = DynamicJarPathFinder.findPath(executor.getClass());
    }
    synchronized (StormExecutionRuntime.class) {
      System.setProperty("storm.jar", jarFile);
      LOG.info("Submitting as cluster mode ...");
      try {
        StormSubmitter.submitTopologyWithProgressBar(topologyName, conf, topology);
      } catch (AlreadyAliveException | InvalidTopologyException e) {
        LOG.error(e.getMessage(), e);
        throw new RuntimeException(e.getMessage(), e);
      } finally {
        System.clearProperty("storm.jar");
      }
    }
  } else {
    LOG.info("Submitting as local mode ...");
    getLocalCluster().submitTopology(topologyName, conf, topology);
    LOG.info("Submitted");
  }
  LOG.info("Started {} ({})", topologyName, executor.getClass().getCanonicalName());
}
@Nonnull
private PartitionAlgorithm<Rating> getRatingPartitionAlgorithm(Config cfg) {
  PartitionAlgorithm<Rating> partition = new HoldoutNPartition<>(10);
  if (cfg.hasPath("holdout")) {
    partition = new HoldoutNPartition<>(cfg.getInt("holdout"));
    if (cfg.hasPath("holdoutFraction")) {
      logger.warn("holdout and holdoutFraction specified, using holdout");
    }
    if (cfg.hasPath("retain")) {
      logger.warn("holdout and retain specified, using holdout");
    }
  } else if (cfg.hasPath("holdoutFraction")) {
    partition = new FractionPartition<>(cfg.getDouble("holdoutFraction"));
    if (cfg.hasPath("retain")) {
      logger.warn("holdoutFraction and retain specified, using holdoutFraction");
    }
  } else if (cfg.hasPath("retain")) {
    partition = new RetainNPartition<>(cfg.getInt("retain"));
  }
  return partition;
}
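// Precedence sketch for the method above: "holdout" beats "holdoutFraction",
// which beats "retain"; the losing keys only trigger a warning. This config
// therefore yields a HoldoutNPartition(5) despite the fraction being present.
PartitionAlgorithm<Rating> p =
    getRatingPartitionAlgorithm(
        ConfigFactory.parseString("holdout = 5\n" + "holdoutFraction = 0.2"));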
@Override
public boolean isSet(String propertyName) {
  return config.hasPath(propertyName);
}
@Override
public Crossfolder buildFromSpec(SpecificationContext context, Config cfg)
    throws SpecificationException {
  Crossfolder cf = new Crossfolder();
  if (cfg.hasPath("name")) {
    cf.setName(cfg.getString("name"));
  }
  cf.setSource(context.build(DataSource.class, cfg.getConfig("input")));
  if (cfg.hasPath("partitions")) {
    cf.setPartitionCount(cfg.getInt("partitions"));
  }

  String method = cfg.hasPath("method") ? cfg.getString("method") : "partition-users";
  switch (method) {
    case "partition-users":
      {
        PartitionAlgorithm<Rating> partition = getRatingPartitionAlgorithm(cfg);
        Order<Rating> order = getRatingOrder(cfg);
        cf.setMethod(CrossfoldMethods.partitionUsers(order, partition));
        break;
      }
    case "sample-users":
      {
        PartitionAlgorithm<Rating> partition = getRatingPartitionAlgorithm(cfg);
        Order<Rating> order = getRatingOrder(cfg);
        int sampleSize = cfg.hasPath("sampleSize") ? cfg.getInt("sampleSize") : 1000;
        cf.setMethod(CrossfoldMethods.sampleUsers(order, partition, sampleSize));
        break;
      }
    case "partition-ratings":
      cf.setMethod(CrossfoldMethods.partitionRatings());
      break;
    default:
      throw new SpecificationException("invalid crossfold method " + method);
  }

  if (cfg.hasPath("includeTimestamps")) {
    cf.setWriteTimestamps(cfg.getBoolean("includeTimestamps"));
  }
  if (cfg.hasPath("outputDir")) {
    cf.setOutputDir(cfg.getString("outputDir"));
  } else {
    logger.warn("no output directory specified for crossfold {}", cf.getName());
  }
  if (cfg.hasPath("outputFormat")) {
    switch (cfg.getString("outputFormat")) {
      case "pack":
        cf.setOutputFormat(OutputFormat.PACK);
        break;
      case "gzip":
        cf.setOutputFormat(OutputFormat.CSV_GZIP);
        break;
      case "xz":
        cf.setOutputFormat(OutputFormat.CSV_XZ);
        break;
      default:
        throw new SpecificationException(
            "invalid output format " + cfg.getString("outputFormat"));
    }
  }
  if (cfg.hasPath("isolate")) {
    cf.setIsolate(cfg.getBoolean("isolate"));
  }
  return cf;
}
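// Hedged end-to-end example of a spec this builder accepts, combining keys from
// buildFromSpec with the order/holdout helpers above. The required "input"
// sub-config is dataset-specific and elided here.
Config spec =
    ConfigFactory.parseString(
        "name = ml-100k\n"
            + "partitions = 5\n"
            + "method = sample-users\n"
            + "sampleSize = 500\n"
            + "holdout = 10\n"
            + "order = timestamp\n"
            + "outputDir = crossfolds\n"
            + "outputFormat = gzip");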
public static InboundSettings create(Config config) {
  Config inbound = config.getConfig("inbound");

  List<String> columnNames;
  if (inbound.hasPath("column-names")) {
    columnNames = inbound.getStringList("column-names");
  } else {
    int numColumns = inbound.getInt("num-columns");
    columnNames = new ArrayList<>(numColumns);
    for (int i = 0; i < numColumns; i++) {
      columnNames.add(String.valueOf(i));
    }
  }

  Function<Object, Integer> lookup = new LookupFunction(columnNames);
  Collection<Integer> allColumns = Collections2.transform(columnNames, lookup);

  Collection<Integer> idColumns;
  if (inbound.hasPath("id-columns")) {
    idColumns =
        ImmutableSet.copyOf(Collections2.transform(inbound.getAnyRefList("id-columns"), lookup));
  } else {
    idColumns = ImmutableSet.of();
  }

  Collection<Integer> ignoredColumns;
  if (inbound.hasPath("ignored-columns")) {
    ignoredColumns =
        ImmutableSet.copyOf(
            Collections2.transform(inbound.getAnyRefList("ignored-columns"), lookup));
  } else {
    ignoredColumns = ImmutableSet.of();
  }

  Collection<Integer> categoricalColumns;
  Collection<Integer> numericColumns;
  if (inbound.hasPath("categorical-columns")) {
    Preconditions.checkState(!inbound.hasPath("numeric-columns"));
    categoricalColumns =
        new HashSet<>(
            Collections2.transform(inbound.getAnyRefList("categorical-columns"), lookup));
    numericColumns = new HashSet<>(allColumns);
    numericColumns.removeAll(categoricalColumns);
  } else if (inbound.hasPath("numeric-columns")) {
    Preconditions.checkState(!inbound.hasPath("categorical-columns"));
    numericColumns =
        new HashSet<>(Collections2.transform(inbound.getAnyRefList("numeric-columns"), lookup));
    categoricalColumns = new HashSet<>(allColumns);
    categoricalColumns.removeAll(numericColumns);
  } else {
    throw new IllegalArgumentException("No categorical-columns or numeric-columns set");
  }
  numericColumns.removeAll(idColumns);
  numericColumns.removeAll(ignoredColumns);
  categoricalColumns.removeAll(idColumns);
  categoricalColumns.removeAll(ignoredColumns);

  Integer targetColumn = null;
  if (inbound.hasPath("target-column")) {
    targetColumn = lookup.apply(inbound.getAnyRef("target-column"));
    Preconditions.checkState(
        categoricalColumns.contains(targetColumn) || numericColumns.contains(targetColumn),
        "Target column not specified as numeric or categorical");
  }

  return new InboundSettings(
      columnNames, idColumns, categoricalColumns, numericColumns, ignoredColumns, targetColumn);
}
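// Hedged example config for create(): either "column-names" or "num-columns" is
// required, exactly one of "categorical-columns"/"numeric-columns" may be given
// (the other set is derived by complement), and column references may be names
// or indices, hence the getAnyRefList calls. Column names are illustrative.
Config c =
    ConfigFactory.parseString(
        "inbound.column-names = [user, item, rating, ts]\n"
            + "inbound.id-columns = [user, item]\n"
            + "inbound.numeric-columns = [rating, ts]\n"
            + "inbound.target-column = rating");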
public SolrCell(
    CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) {
  super(builder, config, parent, child, context);

  Config solrLocatorConfig = getConfigs().getConfig(config, "solrLocator");
  SolrLocator locator = new SolrLocator(solrLocatorConfig, context);
  LOG.debug("solrLocator: {}", locator);
  this.schema = locator.getIndexSchema();
  Preconditions.checkNotNull(schema);
  LOG.trace("Solr schema: \n{}", Joiner.on("\n").join(new TreeMap(schema.getFields()).values()));

  ListMultimap<String, String> cellParams = ArrayListMultimap.create();
  String uprefix = getConfigs().getString(config, ExtractingParams.UNKNOWN_FIELD_PREFIX, null);
  if (uprefix != null) {
    cellParams.put(ExtractingParams.UNKNOWN_FIELD_PREFIX, uprefix);
  }
  for (String capture :
      getConfigs()
          .getStringList(
              config, ExtractingParams.CAPTURE_ELEMENTS, Collections.<String>emptyList())) {
    cellParams.put(ExtractingParams.CAPTURE_ELEMENTS, capture);
  }
  Config fmapConfig = getConfigs().getConfig(config, "fmap", null);
  if (fmapConfig != null) {
    for (Map.Entry<String, Object> entry : new Configs().getEntrySet(fmapConfig)) {
      cellParams.put(ExtractingParams.MAP_PREFIX + entry.getKey(), entry.getValue().toString());
    }
  }
  String captureAttributes =
      getConfigs().getString(config, ExtractingParams.CAPTURE_ATTRIBUTES, null);
  if (captureAttributes != null) {
    cellParams.put(ExtractingParams.CAPTURE_ATTRIBUTES, captureAttributes);
  }
  String lowerNames = getConfigs().getString(config, ExtractingParams.LOWERNAMES, null);
  if (lowerNames != null) {
    cellParams.put(ExtractingParams.LOWERNAMES, lowerNames);
  }
  String defaultField = getConfigs().getString(config, ExtractingParams.DEFAULT_FIELD, null);
  if (defaultField != null) {
    cellParams.put(ExtractingParams.DEFAULT_FIELD, defaultField);
  }
  xpathExpr = getConfigs().getString(config, ExtractingParams.XPATH_EXPRESSION, null);
  if (xpathExpr != null) {
    cellParams.put(ExtractingParams.XPATH_EXPRESSION, xpathExpr);
  }

  this.dateFormats =
      getConfigs()
          .getStringList(config, "dateFormats", new ArrayList<>(DateUtil.DEFAULT_DATE_FORMATS));

  String handlerStr =
      getConfigs()
          .getString(
              config, "solrContentHandlerFactory", TrimSolrContentHandlerFactory.class.getName());
  Class<? extends SolrContentHandlerFactory> factoryClass;
  try {
    factoryClass = (Class<? extends SolrContentHandlerFactory>) Class.forName(handlerStr);
  } catch (ClassNotFoundException cnfe) {
    throw new MorphlineCompilationException(
        "Could not find class " + handlerStr + " to use for solrContentHandlerFactory",
        config,
        cnfe);
  }
  this.solrContentHandlerFactory = getSolrContentHandlerFactory(factoryClass, dateFormats, config);

  this.locale = getLocale(getConfigs().getString(config, "locale", ""));

  this.mediaTypeToParserMap = new HashMap<>();
  // MimeTypes mimeTypes = MimeTypes.getDefaultMimeTypes(); // FIXME
  // getMediaTypeRegistry.normalize()

  List<? extends Config> parserConfigs = getConfigs().getConfigList(config, "parsers");
  for (Config parserConfig : parserConfigs) {
    String parserClassName = getConfigs().getString(parserConfig, "parser");
    Object obj;
    try {
      obj = Class.forName(parserClassName).newInstance();
    } catch (Throwable e) {
      throw new MorphlineCompilationException(
          "Cannot instantiate Tika parser: " + parserClassName, config, e);
    }
    if (!(obj instanceof Parser)) {
      throw new MorphlineCompilationException(
          "Tika parser "
              + obj.getClass().getName()
              + " must be an instance of class "
              + Parser.class.getName(),
          config);
    }
    Parser parser = (Parser) obj;
    this.parsers.add(parser);

    List<String> mediaTypes =
        getConfigs()
            .getStringList(parserConfig, SUPPORTED_MIME_TYPES, Collections.<String>emptyList());
    for (String mediaTypeStr : mediaTypes) {
      MediaType mediaType = parseMediaType(mediaTypeStr);
      addSupportedMimeType(mediaTypeStr);
      this.mediaTypeToParserMap.put(mediaType, parser);
    }

    if (!parserConfig.hasPath(SUPPORTED_MIME_TYPES)) {
      for (MediaType mediaType : parser.getSupportedTypes(new ParseContext())) {
        mediaType = mediaType.getBaseType();
        addSupportedMimeType(mediaType.toString());
        this.mediaTypeToParserMap.put(mediaType, parser);
      }
      List<String> extras =
          getConfigs()
              .getStringList(
                  parserConfig, ADDITIONAL_SUPPORTED_MIME_TYPES, Collections.<String>emptyList());
      for (String mediaTypeStr : extras) {
        MediaType mediaType = parseMediaType(mediaTypeStr);
        addSupportedMimeType(mediaTypeStr);
        this.mediaTypeToParserMap.put(mediaType, parser);
      }
    }
  }
  // LOG.info("mediaTypeToParserMap=" + mediaTypeToParserMap);

  Map<String, String[]> tmp = new HashMap<>();
  for (Map.Entry<String, Collection<String>> entry : cellParams.asMap().entrySet()) {
    tmp.put(entry.getKey(), entry.getValue().toArray(new String[entry.getValue().size()]));
  }
  this.solrParams = new MultiMapSolrParams(tmp);
  validateArguments();
}
@Override
public ActorSystem buildActorSystem(Config config) throws Exception {
  // Start redis.
  final int redisPort =
      config.hasPath("components.redis.port")
          ? config.getInt("components.redis.port")
          : RedisURI.DEFAULT_REDIS_PORT;
  final String redisLogLevel =
      config.hasPath("components.redis.log-level")
          ? config.getString("components.redis.log-level")
          : "verbose";
  String logBase = System.getenv("LOG_BASE");
  if (StringUtils.isBlank(logBase)) {
    logBase = System.getenv("TEMP");
  }
  final String redisLogFile =
      config.hasPath("components.redis.log-file")
          ? config.getString("components.redis.log-file")
          : logBase + "\\redis.log";
  final String redisPidFile =
      config.hasPath("components.redis.pid-file")
          ? config.getString("components.redis.pid-file")
          : logBase + "\\redis.pid";
  try {
    this.redis =
        RedisServer.builder()
            .redisExecProvider(RedisExecProvider.defaultProvider())
            .port(redisPort)
            .setting("loglevel " + redisLogLevel)
            .setting("logfile " + redisLogFile)
            .setting("pidfile " + redisPidFile)
            .build();
  } catch (Exception ex) {
    this.logger.error("Failed to build redis server.", ex);
    throw new IllegalStateException("Failed to build redis server.", ex);
  }
  new Thread() {
    @Override
    public void run() {
      try {
        redis.start();
        logger.info("Started redis server on {} port", redisPort);
      } catch (Exception ex) {
        logger.error("Failed to start redis server.", ex);
        // @TODO Use future to stop the actor system at this point.
      }
    }
  }.start();

  // Create redis client.
  String redisUri = "redis://" + this.getAddress().getHostAddress() + ":" + redisPort + "/0";
  this.redisClient = new RedisClient(RedisURI.create(redisUri));

  ActorSystem system = ActorSystem.create(this.getClusterName(), config);
  Camel camel = CamelExtension.get(system);
  this.baseUrl =
      "http://"
          + this.getAddress().getHostAddress()
          + ":"
          + this.getHttpPort()
          + "/"
          + this.getApplicationName();
  String uri = "jetty:" + this.baseUrl;

  String recorderKeyBase = this.getClusterName() + ":" + "words";
  ActorRef recordingService =
      system.actorOf(
          Props.create(RecordingService.class, recorderKeyBase, this.redisClient),
          "recorderService");

  String tracerKey = this.getClusterName() + ":trace:node:1";
  ActorRef traceLogService =
      system.actorOf(
          Props.create(TraceLogService.class, tracerKey, this.redisClient, this.jacksonMapper),
          "traceLogService");

  ActorRef analysisService =
      system.actorOf(
          Props.create(AnalysisService.class, recordingService, traceLogService),
          "analysisService");

  String pathBase =
      "akka.tcp://" + this.getClusterName() + "@" + this.getAddress().getHostAddress() + ":";
  SimpleRoutingMap<String> routingMap = new SimpleRoutingMap<String>();
  routingMap.putPath(new Key<String>("2551"), pathBase + "2551/user/analysisService");
  routingMap.putPath(new Key<String>("2552"), pathBase + "2552/user/analysisService");

  ActorRef httpClerk =
      system.actorOf(Props.create(WebService.class, uri, routingMap), "httpClerk");
  Future<ActorRef> activationFuture =
      camel.activationFutureFor(
          httpClerk, new Timeout(Duration.create(10, TimeUnit.SECONDS)), system.dispatcher());

  return system;
}
private void initWord2Vec(String name) throws ConfigurationException, IOException, DaoException {
  Config config = getMetricConfig(name).getConfig("generator");
  File model = Word2VecGenerator.getModelFile(config.getString("modelDir"), language);
  if (skipBuiltMetrics && model.isFile()) {
    return;
  }

  if (config.hasPath("prebuilt") && config.getBoolean("prebuilt")) {
    if (model.isFile()) {
      return;
    }
    File downloadPath = new File(config.getString("binfile"));
    if (!downloadPath.isFile()) {
      throw new ConfigurationException(
          "word2vec model "
              + downloadPath.getAbsolutePath()
              + " cannot be found."
              + " You must download it from "
              + config.getString("url")
              + " into the wikibrain download directory.");
    }
    if (!config.getStringList("languages").contains(language.getLangCode())) {
      throw new ConfigurationException(
          "word2vec model " + downloadPath + " does not support language " + language);
    }
    if (downloadPath.toString().toLowerCase().endsWith("gz")) {
      LOG.info("decompressing " + downloadPath + " to " + model);
      File tmp = File.createTempFile("word2vec", "bin");
      try {
        FileUtils.deleteQuietly(tmp);
        GZIPInputStream gz = new GZIPInputStream(new FileInputStream(downloadPath));
        FileUtils.copyInputStreamToFile(gz, tmp);
        gz.close();
        model.getParentFile().mkdirs();
        FileUtils.moveFile(tmp, model);
      } finally {
        FileUtils.deleteQuietly(tmp);
      }
    } else {
      FileUtils.copyFile(downloadPath, model);
    }
    return;
  }

  LinkProbabilityDao lpd = env.getConfigurator().get(LinkProbabilityDao.class);
  lpd.useCache(true);
  if (!lpd.isBuilt()) {
    lpd.build();
  }

  String corpusName = config.getString("corpus");
  Corpus corpus = null;
  if (!corpusName.equals("NONE")) {
    corpus =
        env.getConfigurator()
            .get(Corpus.class, config.getString("corpus"), "language", language.getLangCode());
    if (!corpus.exists()) {
      corpus.create();
    }
  }

  if (model.isFile()
      && (corpus == null || model.lastModified() > corpus.getCorpusFile().lastModified())) {
    return;
  }
  if (corpus == null) {
    throw new ConfigurationException(
        "word2vec metric "
            + name
            + " cannot build or find model!"
            + " Configuration has no corpus, but model not found at "
            + model
            + ".");
  }

  Word2VecTrainer trainer =
      new Word2VecTrainer(env.getConfigurator().get(LocalPageDao.class), language);
  if (config.hasPath("dimensions")) {
    LOG.info("set number of dimensions to " + config.getInt("dimensions"));
    trainer.setLayer1Size(config.getInt("dimensions"));
  }
  if (config.hasPath("maxWords")) {
    LOG.info("set maxWords to " + config.getInt("maxWords"));
    trainer.setMaxWords(config.getInt("maxWords"));
  }
  if (config.hasPath("window")) {
    LOG.info("set window to " + config.getInt("window"));
    trainer.setWindow(config.getInt("window"));
  }
  trainer.setKeepAllArticles(true);
  trainer.train(corpus.getDirectory());
  trainer.save(model);
}