/**
 * Creates a version finder driven by a date-time pattern read from {@code config}.
 *
 * <p>The pattern at {@code DATE_TIME_PATTERN_KEY} is mandatory. The glob pattern is taken from
 * {@code OPTIONAL_GLOB_PATTERN_TIMEZONE_KEY} when that key is present; otherwise it is derived
 * from the date-time pattern by replacing every path segment with {@code *}. The formatter uses
 * the zone configured at {@code DATE_TIME_PATTERN_TIMEZONE_KEY}, falling back to {@code
 * DEFAULT_DATE_TIME_PATTERN_TIMEZONE}.
 *
 * @param fs file system handed to the superclass constructor
 * @param config configuration holding the pattern and the optional glob / timezone settings
 */
public DateTimeDatasetVersionFinder(FileSystem fs, Config config) {
    super(fs);
    Preconditions.checkArgument(
        config.hasPath(DATE_TIME_PATTERN_KEY),
        "Missing required property " + DATE_TIME_PATTERN_KEY);
    final String dateTimePattern = config.getString(DATE_TIME_PATTERN_KEY);

    // An explicitly configured glob wins; otherwise each path segment collapses to "*".
    this.globPattern =
        new Path(
            config.hasPath(OPTIONAL_GLOB_PATTERN_TIMEZONE_KEY)
                ? config.getString(OPTIONAL_GLOB_PATTERN_TIMEZONE_KEY)
                : dateTimePattern.replaceAll("[^/]+", "*"));

    LOGGER.debug(
        String.format(
            "Setting timezone for patthern: %s. By default it is %s",
            dateTimePattern, DEFAULT_DATE_TIME_PATTERN_TIMEZONE));

    // Same formatter in both cases; only the zone id differs.
    this.formatter =
        DateTimeFormat.forPattern(dateTimePattern)
            .withZone(
                DateTimeZone.forID(
                    config.hasPath(DATE_TIME_PATTERN_TIMEZONE_KEY)
                        ? config.getString(DATE_TIME_PATTERN_TIMEZONE_KEY)
                        : DEFAULT_DATE_TIME_PATTERN_TIMEZONE));

    this.datePartitionPattern = dateTimePattern;
  }
  /**
   *
   *
   * <ul>
   *   <li>The constructor takes in a dataset {@link Config} which MUST have a comma separated list
   *       of destination formats at key, {@value #DESTINATION_CONVERSION_FORMATS_KEY}
   *   <li>Conversion configuration for a format can be set by using destination format as prefix.
   *   <li>E.g. If {@value #DESTINATION_CONVERSION_FORMATS_KEY}=flattenedOrc,nestedOrc.<br>
   *       The destination table name for flattened ORC is set at flattenedOrc.tableName<br>
   *       And the destination table name for nested ORC is set at nestedOrc.tableName
   * </ul>
   *
   * @param fs
   * @param clientPool
   * @param table
   * @param config
   */
  public ConvertibleHiveDataset(
      FileSystem fs, HiveMetastoreClientPool clientPool, Table table, Config config) {
    super(fs, clientPool, table, config);

    Preconditions.checkArgument(
        config.hasPath(DESTINATION_CONVERSION_FORMATS_KEY),
        String.format(
            "Atleast one destination format should be specified at %s.%s. If you do not intend to convert this dataset set %s.%s to true",
            super.properties.getProperty(HiveDatasetFinder.HIVE_DATASET_CONFIG_PREFIX_KEY, ""),
            DESTINATION_CONVERSION_FORMATS_KEY,
            super.properties.getProperty(HiveDatasetFinder.HIVE_DATASET_CONFIG_PREFIX_KEY, ""),
            HiveDatasetFinder.HIVE_DATASET_IS_BLACKLISTED_KEY));

    // value for DESTINATION_CONVERSION_FORMATS_KEY can be a TypeSafe list or a comma separated list
    // of string
    this.destFormats =
        Sets.newHashSet(ConfigUtils.getStringList(config, DESTINATION_CONVERSION_FORMATS_KEY));

    // For each format create ConversionConfig and store it in a Map<format,conversionConfig>
    this.destConversionConfigs = Maps.newHashMap();

    for (String format : this.destFormats) {
      if (config.hasPath(format)) {
        this.destConversionConfigs.put(
            format, new ConversionConfig(config.getConfig(format), table, format));
      }
    }
  }
  /**
   * Assembles the Storm configuration used when submitting a topology.
   *
   * <p>Buffer and batch sizes are fixed. The Nimbus host and thrift port come from the
   * environment configuration when present, otherwise from the built-in defaults. The worker
   * count and message timeout are copied from the application {@code config} only when set.
   *
   * @param config application configuration supplying the optional worker / timeout settings
   * @return the populated Storm configuration
   */
  private backtype.storm.Config getStormConfig(com.typesafe.config.Config config) {
    final backtype.storm.Config stormConf = new backtype.storm.Config();

    // Fixed batch and buffer sizes.
    stormConf.put(RichSpoutBatchExecutor.MAX_BATCH_SIZE_CONF, Int.box(64 * 1024));
    stormConf.put(backtype.storm.Config.TOPOLOGY_RECEIVER_BUFFER_SIZE, Int.box(8));
    stormConf.put(backtype.storm.Config.TOPOLOGY_TRANSFER_BUFFER_SIZE, Int.box(32));
    stormConf.put(backtype.storm.Config.TOPOLOGY_EXECUTOR_RECEIVE_BUFFER_SIZE, Int.box(16384));
    stormConf.put(backtype.storm.Config.TOPOLOGY_EXECUTOR_SEND_BUFFER_SIZE, Int.box(16384));
    stormConf.put(backtype.storm.Config.NIMBUS_THRIFT_MAX_BUFFER_SIZE, Int.box(20480000));

    // Nimbus host: environment override or default.
    final String nimbusHost;
    if (!environment.config().hasPath(STORM_NIMBUS_HOST_CONF_PATH)) {
      nimbusHost = STORM_NIMBUS_HOST_DEFAULT;
      LOG.info("Using default {} = {}", STORM_NIMBUS_HOST_CONF_PATH, STORM_NIMBUS_HOST_DEFAULT);
    } else {
      nimbusHost = environment.config().getString(STORM_NIMBUS_HOST_CONF_PATH);
      LOG.info("Overriding {} = {}", STORM_NIMBUS_HOST_CONF_PATH, nimbusHost);
    }

    // Nimbus thrift port: environment override or default.
    final Integer nimbusThriftPort;
    if (!environment.config().hasPath(STORM_NIMBUS_THRIFT_CONF_PATH)) {
      nimbusThriftPort = STORM_NIMBUS_THRIFT_DEFAULT;
      LOG.info("Using default {} = {}", STORM_NIMBUS_THRIFT_CONF_PATH, STORM_NIMBUS_THRIFT_DEFAULT);
    } else {
      nimbusThriftPort = environment.config().getInt(STORM_NIMBUS_THRIFT_CONF_PATH);
      LOG.info("Overriding {} = {}", STORM_NIMBUS_THRIFT_CONF_PATH, nimbusThriftPort);
    }

    stormConf.put(backtype.storm.Config.NIMBUS_HOST, nimbusHost);
    stormConf.put(backtype.storm.Config.NIMBUS_THRIFT_PORT, nimbusThriftPort);
    stormConf.put(
        Config.STORM_THRIFT_TRANSPORT_PLUGIN, "backtype.storm.security.auth.SimpleTransportPlugin");

    // Optional per-application settings.
    if (config.hasPath(WORKERS)) {
      stormConf.setNumWorkers(config.getInt(WORKERS));
    }
    if (config.hasPath(TOPOLOGY_MESSAGE_TIMEOUT_SECS)) {
      stormConf.put(TOPOLOGY_MESSAGE_TIMEOUT_SECS, config.getInt(TOPOLOGY_MESSAGE_TIMEOUT_SECS));
    }
    return stormConf;
  }
    private ConversionConfig(Config config, Table table, String destinationFormat) {

      Preconditions.checkArgument(
          config.hasPath(DESTINATION_TABLE_KEY),
          String.format("Key %s.%s is not specified", destinationFormat, DESTINATION_TABLE_KEY));
      Preconditions.checkArgument(
          config.hasPath(DESTINATION_DB_KEY),
          String.format("Key %s.%s is not specified", destinationFormat, DESTINATION_DB_KEY));
      Preconditions.checkArgument(
          config.hasPath(DESTINATION_DATA_PATH_KEY),
          String.format(
              "Key %s.%s is not specified", destinationFormat, DESTINATION_DATA_PATH_KEY));

      // Required
      this.destinationFormat = destinationFormat;
      this.destinationTableName = resolveTemplate(config.getString(DESTINATION_TABLE_KEY), table);
      this.destinationStagingTableName =
          String.format(
              "%s_%s", this.destinationTableName, "staging"); // Fixed and non-configurable
      this.destinationDbName = resolveTemplate(config.getString(DESTINATION_DB_KEY), table);
      this.destinationDataPath =
          resolveTemplate(config.getString(DESTINATION_DATA_PATH_KEY), table);

      // Optional
      this.clusterBy = ConfigUtils.getStringList(config, CLUSTER_BY_KEY);
      this.numBuckets = Optional.fromNullable(ConfigUtils.getInt(config, NUM_BUCKETS_KEY, null));
      this.hiveRuntimeProperties =
          ConfigUtils.configToProperties(
              ConfigUtils.getConfig(
                  config, HIVE_RUNTIME_PROPERTIES_KEY_PREFIX, ConfigFactory.empty()));
      this.evolutionEnabled = ConfigUtils.getBoolean(config, EVOLUTION_ENABLED, false);
      this.rowLimit = Optional.fromNullable(ConfigUtils.getInt(config, ROW_LIMIT_KEY, null));
      this.sourceDataPathIdentifier =
          ConfigUtils.getStringList(config, SOURCE_DATA_PATH_IDENTIFIER_KEY);
    }
Beispiel #5
0
  /**
   * Trains the metric with the given name according to the current {@code mode} and then writes
   * it.
   *
   * <p>Similarity is trained when the mode is {@code SIMILARITY} or {@code BOTH}, most-similar
   * when it is {@code MOSTSIMILAR} or {@code BOTH}. When {@code skipBuiltMetrics} is set, a part
   * that reports itself as already trained is skipped.
   *
   * @param name name of the metric to build
   * @throws ConfigurationException propagated from the configuration / metric lookups used here
   * @throws DaoException propagated from the calls made here
   * @throws IOException propagated from the calls made here
   * @throws UnsupportedOperationException always, for metrics of type {@code
   *     sparsevector.mostsimilarconcepts}
   */
  public void buildMetric(String name) throws ConfigurationException, DaoException, IOException {

    LOG.info("building component metric " + name);
    String type = getMetricType(name);
    // word2vec metrics are initialised before the metric instance is looked up
    if (type.equals("densevector.word2vec")) {
      initWord2Vec(name);
    }

    SRMetric metric = getMetric(name);
    if (type.equals("ensemble")) {
      ((EnsembleMetric) metric).setTrainSubmetrics(false); // Do it by hand
    } else if (type.equals("sparsevector.mostsimilarconcepts")) {
      // NOTE: this upgrades the instance-level mode, so it also affects later calls
      if (mode == Mode.SIMILARITY) {
        LOG.warn("metric " + name + " of type " + type + " requires mostSimilar... training BOTH");
        mode = Mode.BOTH;
      }
      // NOTE(review): this branch unconditionally aborts, so metrics of this type can never be
      // built through this method; the message suggests the block is meant to be moved.
      throw new UnsupportedOperationException("This block needs to occur earlier.");
    } else if (type.equals("milnewitten")) {
      ((MilneWittenMetric) metric).setTrainSubmetrics(false);
    }

    if (metric instanceof BaseSRMetric) {
      ((BaseSRMetric) metric).setBuildMostSimilarCache(buildCosimilarity);
    }

    Dataset ds = getDataset();
    if (mode == Mode.SIMILARITY || mode == Mode.BOTH) {
      if (skipBuiltMetrics && metric.similarityIsTrained()) {
        LOG.info("metric " + name + " similarity() is already trained... skipping");
      } else {
        metric.trainSimilarity(ds);
      }
    }

    if (mode == Mode.MOSTSIMILAR || mode == Mode.BOTH) {
      if (skipBuiltMetrics && metric.mostSimilarIsTrained()) {
        LOG.info("metric " + name + " mostSimilar() is already trained... skipping");
      } else {
        Config config = getMetricConfig(name);
        // defaults, each overridable by the metric's own configuration below
        int n = maxResults * EnsembleMetric.SEARCH_MULTIPLIER;
        TIntSet validIds = validMostSimilarIds;
        if (config.hasPath("maxResults")) {
          n = config.getInt("maxResults");
        }
        if (config.hasPath("mostSimilarConcepts")) {
          // valid ids are read from "<mostSimilarConcepts>/<langCode>.txt"
          String path =
              String.format(
                  "%s/%s.txt",
                  config.getString("mostSimilarConcepts"), metric.getLanguage().getLangCode());
          validIds = readIds(path);
        }
        metric.trainMostSimilar(ds, n, validIds);
      }
    }
    // written even when both training steps were skipped
    metric.write();
  }
  /**
   * Manual smoke test: prints the {@code greed} section of the bundled {@code test} configuration
   * and of {@code src/main/resources/default.conf} under the working directory.
   *
   * @param args ignored
   */
  public static void main(String[] args) {
    Config bundled = ConfigFactory.load("test");
    if (bundled.hasPath("greed")) {
      System.out.println(bundled.getConfig("greed").toString());
    }
    // Not guarded: fails if the path is absent.
    System.out.println(bundled.getConfig("greed.lang.cpp").resolve().toString());

    String defaultConfPath = System.getProperty("user.dir") + "/src/main/resources/default.conf";
    Config fromFile = ConfigFactory.parseFile(new File(defaultConfPath)).resolve();
    if (fromFile.hasPath("greed")) {
      System.out.println(fromFile.getConfig("greed").toString());
    }
  }
Beispiel #7
0
 /**
  * Creates the sampling command.
  *
  * <p>Reads {@code probability} (default 1.0) and rejects negative values. For a probability of
  * at least 1.0 no random generator is created. Otherwise a {@code Well19937c} generator is
  * seeded from the {@code seed} setting when present, or from 624 ints drawn from a {@link
  * SecureRandom}.
  */
 public Sample(
     CommandBuilder builder,
     Config config,
     Command parent,
     Command child,
     MorphlineContext context) {
   super(builder, config, parent, child, context);
   this.probability = getConfigs().getDouble(config, "probability", 1.0);
   if (probability < 0.0) {
     throw new MorphlineCompilationException(
         "Probability must not be negative: " + probability, config);
   }

   if (probability >= 1.0) {
     this.prng = null;
   } else if (config.hasPath("seed")) {
     // explicit seed
     long configuredSeed = getConfigs().getLong(config, "seed");
     this.prng = new Well19937c(configuredSeed); // non-secure & fast
   } else {
     // no seed configured: derive one from a secure source
     Random entropy = new SecureRandom();
     int[] seedWords = new int[624];
     for (int idx = 0; idx < seedWords.length; idx++) {
       seedWords[idx] = entropy.nextInt();
     }
     this.prng = new Well19937c(seedWords); // non-secure & fast
   }
   validateArguments();
 }
Beispiel #8
0
  /**
   * Returns a list of metric names (including the passed in name) that are a submetric of the
   * specified metric. The metrics are ordered by dependency: each name appears once, at the
   * position of its first occurrence, so dependencies precede the metrics that rely on them and
   * the parent metric is added after its dependencies.
   *
   * <p>Ensemble metrics recurse into every entry of {@code metrics}; {@code
   * sparsevector.mostsimilarconcepts} recurses into {@code generator.basemetric}; {@code
   * milnewitten} adds its {@code inlink} and {@code outlink} metrics directly; any other type
   * adds the names listed at {@code reliesOn}, if present.
   *
   * @param parentName name of the metric whose dependencies are requested
   * @return a new, de-duplicated list of metric names ending with the dependencies' parent
   * @throws ConfigurationException propagated from the metric type / configuration lookups
   */
  public List<String> getSubmetrics(String parentName) throws ConfigurationException {
    String type = getMetricType(parentName);
    Config config = getMetricConfig(parentName);

    // Insertion-ordered set: re-adding a name keeps its FIRST position, which preserves the
    // dependency order while avoiding the quadratic List.contains() scan.
    java.util.Set<String> ordered = new java.util.LinkedHashSet<String>();
    if (type.equals("ensemble") || type.equals("simple-ensemble")) {
      for (String child : config.getStringList("metrics")) {
        ordered.addAll(getSubmetrics(child));
        ordered.add(child);
      }
    } else if (type.equals("sparsevector.mostsimilarconcepts")) {
      ordered.addAll(getSubmetrics(config.getString("generator.basemetric")));
    } else if (type.equals("milnewitten")) {
      ordered.add(config.getString("inlink"));
      ordered.add(config.getString("outlink"));
    } else if (config.hasPath("reliesOn")) {
      ordered.addAll(config.getStringList("reliesOn"));
    }
    ordered.add(parentName);

    return new ArrayList<String>(ordered);
  }
 /**
  * Creates a finder whose glob pattern is read from {@code VERSION_FINDER_GLOB_PATTERN_KEY},
  * defaulting to {@code *} when the key is absent.
  *
  * @param fs file system forwarded to the delegate constructor
  * @param config configuration that may carry the glob pattern
  */
 public GlobModTimeDatasetVersionFinder(FileSystem fs, Config config) {
   this(
       fs,
       new Path(
           config.hasPath(VERSION_FINDER_GLOB_PATTERN_KEY)
               ? config.getString(VERSION_FINDER_GLOB_PATTERN_KEY)
               : "*"));
 }
Beispiel #10
0
  /**
   * Creates the service and initialises its YARN clients from the given configuration.
   *
   * <p>Metrics and the event submitter are only built when {@code
   * ConfigurationKeys.METRICS_ENABLED_KEY} is true. Container sizing, host affinity and Helix
   * retry settings are read from {@code config} without defaults, so they must be present.
   *
   * @param config application configuration
   * @param applicationName name stored on this service
   * @param applicationId id stored on this service
   * @param yarnConfiguration configuration used to initialise both YARN async clients
   * @param fs file system stored on this service
   * @param eventBus event bus stored on this service
   * @throws Exception declared by the constructor; the failing call is not identifiable from
   *     this block alone
   */
  public YarnService(
      Config config,
      String applicationName,
      String applicationId,
      YarnConfiguration yarnConfiguration,
      FileSystem fs,
      EventBus eventBus)
      throws Exception {
    this.applicationName = applicationName;
    this.applicationId = applicationId;

    this.config = config;

    this.eventBus = eventBus;

    // NOTE(review): the build* helpers run after the fields above are assigned; presumably they
    // read them, so keep this ordering -- confirm against the helpers.
    this.gobblinMetrics =
        config.getBoolean(ConfigurationKeys.METRICS_ENABLED_KEY)
            ? Optional.of(buildGobblinMetrics())
            : Optional.<GobblinMetrics>absent();

    this.eventSubmitter =
        config.getBoolean(ConfigurationKeys.METRICS_ENABLED_KEY)
            ? Optional.of(buildEventSubmitter())
            : Optional.<EventSubmitter>absent();

    this.yarnConfiguration = yarnConfiguration;
    this.fs = fs;

    // Both async clients are registered with the closer and initialised, but not started here.
    this.amrmClientAsync =
        closer.register(
            AMRMClientAsync.createAMRMClientAsync(1000, new AMRMClientCallbackHandler()));
    this.amrmClientAsync.init(this.yarnConfiguration);
    this.nmClientAsync =
        closer.register(NMClientAsync.createNMClientAsync(new NMClientCallbackHandler()));
    this.nmClientAsync.init(this.yarnConfiguration);

    // Required container settings (no defaults are supplied here).
    this.initialContainers = config.getInt(GobblinYarnConfigurationKeys.INITIAL_CONTAINERS_KEY);
    this.requestedContainerMemoryMbs =
        config.getInt(GobblinYarnConfigurationKeys.CONTAINER_MEMORY_MBS_KEY);
    this.requestedContainerCores = config.getInt(GobblinYarnConfigurationKeys.CONTAINER_CORES_KEY);
    this.containerHostAffinityEnabled =
        config.getBoolean(GobblinYarnConfigurationKeys.CONTAINER_HOST_AFFINITY_ENABLED);

    this.helixInstanceMaxRetries =
        config.getInt(GobblinYarnConfigurationKeys.HELIX_INSTANCE_MAX_RETRIES);

    // JVM arguments are optional.
    this.containerJvmArgs =
        config.hasPath(GobblinYarnConfigurationKeys.CONTAINER_JVM_ARGS_KEY)
            ? Optional.of(config.getString(GobblinYarnConfigurationKeys.CONTAINER_JVM_ARGS_KEY))
            : Optional.<String>absent();

    // Fixed pool of 10 threads for launching containers.
    this.containerLaunchExecutor =
        Executors.newFixedThreadPool(
            10,
            ExecutorsUtils.newThreadFactory(
                Optional.of(LOGGER), Optional.of("ContainerLaunchExecutor")));

    this.tokens = getSecurityTokens();
  }
 /**
  * Returns the configured {@code cpuSetSize}, or 1 (after logging an error) when it is not set.
  */
 public int getCpuSetSize() {
   if (!vampires.hasPath("cpuSetSize")) {
     LOG.error("missing executor cpuSetSize");
     return 1;
   }
   return vampires.getInt("cpuSetSize");
 }
 /**
  * Reads the LSH settings and the optional candidate filter class name from the default
  * configuration. The class name is {@code null} when no filter class is configured.
  */
 public CandidateFilterFactory() {
   Config defaults = ConfigUtils.getDefaultConfig();
   lshSampleRatio = defaults.getDouble("model.lsh.sample-ratio");
   numHashes = defaults.getInt("model.lsh.num-hashes");
   if (defaults.hasPath("serving-layer.candidate-filter-class")) {
     candidateFilterClassName = defaults.getString("serving-layer.candidate-filter-class");
   } else {
     candidateFilterClassName = null;
   }
 }
Beispiel #13
0
 /**
  * Loads this component's settings from the sub-tree of {@code config} rooted at {@code key}.
  * All entries except {@code keys} are required; {@code keys} is left untouched when absent.
  */
 @Override
 public void configure(Config config, String key) {
   final String keysPath = key + ".keys";

   fieldName1 = config.getString(key + ".field1");
   fieldName2 = config.getString(key + ".field2");
   if (config.hasPath(keysPath)) {
     keys = config.getStringList(keysPath);
   }
   key2 = config.getString(key + ".key2");
   constant = config.getDouble(key + ".constant");
   outputName = config.getString(key + ".output");
 }
 /**
  * Returns the configured executor names converted to upper case.
  *
  * @throws IllegalArgumentException if the {@code executors} setting is missing
  */
 public List<String> getExecutors() {
   if (!vampires.hasPath("executors")) {
     LOG.error("missing executors config value");
     throw new IllegalArgumentException("missing executors config value");
   }
   return vampires.getStringList("executors").stream()
       .map(executorName -> executorName.toUpperCase())
       .collect(Collectors.toList());
 }
Beispiel #15
0
  /** Only properties under the requested prefix may end up in the resulting config. */
  @Test
  public void testPropertiesToConfigWithPrefix() {
    Properties source = new Properties();
    source.setProperty("k1.kk1", "v1");
    source.setProperty("k1.kk2", "v2");
    source.setProperty("k2.kk", "v3");

    Config filtered = ConfigUtils.propertiesToConfig(source, Optional.of("k1"));

    // entries under the prefix survive ...
    Assert.assertEquals(filtered.getString("k1.kk1"), "v1");
    Assert.assertEquals(filtered.getString("k1.kk2"), "v2");
    // ... everything else is dropped
    Assert.assertFalse(filtered.hasPath("k2.kk"), "Should not contain key k2.kk");
  }
 /**
  * Registers cache manager peer listener factories. A {@code conf} that carries a {@code class}
  * entry is treated as a single factory definition; otherwise every child of {@code conf} is
  * registered as its own named factory.
  */
 private void cacheManagerPeerListenerFactory(final Config conf) {
   if (!conf.hasPath("class")) {
     // several named factories
     each(
         conf,
         (factoryName, factoryConf) -> {
           eh.addCacheManagerPeerListenerFactory(
               newFactory(
                   "ehcache.cacheManagerPeerListenerFactory." + factoryName,
                   factoryConf,
                   FactoryConfiguration::new));
         });
     return;
   }
   // single factory
   eh.addCacheManagerPeerListenerFactory(
       newFactory("ehcache.cacheManagerPeerListenerFactory", conf, FactoryConfiguration::new));
 }
Beispiel #17
0
 /**
  * Instantiates the module named at {@code server.module}, if any, and delegates configuration
  * to it. Any failure while loading, instantiating or configuring the module is rethrown as an
  * {@link IllegalStateException}.
  */
 @Override
 public void configure(final Env env, final Config config, final Binder binder) {
   if (!config.hasPath("server.module")) {
     return;
   }
   try {
     final ClassLoader loader = getClass().getClassLoader();
     final Class<?> moduleClass = loader.loadClass(config.getString("server.module"));
     delegate = (Jooby.Module) moduleClass.newInstance();
     delegate.configure(env, config, binder);
   } catch (Exception ex) {
     throw new IllegalStateException(
         "No " + Server.class.getName() + " implementation was found.", ex);
   }
 }
 /**
  * Resolves the rating order from the {@code order} setting ({@code random} or {@code
  * timestamp}, case-insensitive). A random order is used when the setting is absent.
  *
  * @throws SpecificationException if the configured order is not recognised
  */
 @Nonnull
 private Order<Rating> getRatingOrder(Config cfg) throws SpecificationException {
   Order<Rating> chosen = new RandomOrder<>();
   if (!cfg.hasPath("order")) {
     return chosen;
   }

   final String normalized = cfg.getString("order").toLowerCase();
   if (normalized.equals("random")) {
     chosen = new RandomOrder<>();
   } else if (normalized.equals("timestamp")) {
     chosen = new TimestampOrder<>();
   } else {
     throw new SpecificationException(
         "invalid order " + cfg.getString("order") + " for crossfold");
   }
   return chosen;
 }
Beispiel #19
0
  /**
   * Builds the Hibernate settings map.
   *
   * <p>Every entry under {@code hibernate} is copied with a {@code hibernate.} key prefix, the
   * loaded classes are added when the list is not empty, and {@code hibernate.hbm2ddl.auto}
   * defaults to {@code update} in the {@code dev} environment and {@code validate} otherwise
   * when it is not configured explicitly.
   */
  private static Map<Object, Object> config(
      final Env env, final Config config, final List<Class<?>> classes) {
    final Map<Object, Object> settings = new HashMap<>();

    config
        .getConfig("hibernate")
        .entrySet()
        .forEach(
            entry -> settings.put("hibernate." + entry.getKey(), entry.getValue().unwrapped()));

    if (!classes.isEmpty()) {
      settings.put(AvailableSettings.LOADED_CLASSES, classes);
    }

    if (!config.hasPath("hibernate.hbm2ddl.auto")) {
      final String schemaMode;
      if (env.name().equals("dev")) {
        schemaMode = "update";
      } else {
        schemaMode = "validate";
      }
      settings.put("hibernate.hbm2ddl.auto", schemaMode);
    }

    return settings;
  }
 /**
  * Builds the topology for {@code executor} and submits it, either to the cluster (when {@code
  * mode} equals {@code CLUSTER}, ignoring case) or to the local cluster.
  *
  * <p>For cluster submission the {@code storm.jar} system property is set for the duration of
  * the submit call and cleared afterwards; the whole sequence is serialized on this class.
  */
 @Override
 public void start(
     Application<StormEnvironment, StormTopology> executor, com.typesafe.config.Config config) {
   final String topologyName = config.getString("appId");
   final String executorName = executor.getClass().getCanonicalName();
   Preconditions.checkNotNull(topologyName, "[appId] is required by null for " + executorName);

   final StormTopology topology = executor.execute(config, environment);
   final String mode = config.getString("mode");
   LOG.info("Starting {} ({}), mode: {}", topologyName, executorName, mode);
   final Config stormConf = getStormConfig(config);

   final boolean clusterMode = ApplicationEntity.Mode.CLUSTER.name().equalsIgnoreCase(mode);
   if (!clusterMode) {
     LOG.info("Submitting as local mode ...");
     getLocalCluster().submitTopology(topologyName, stormConf, topology);
     LOG.info("Submitted");
   } else {
     // Explicit jar path wins; otherwise locate the jar that contains the executor class.
     String jarFile = null;
     if (config.hasPath("jarPath")) {
       jarFile = config.getString("jarPath");
     }
     if (jarFile == null) {
       jarFile = DynamicJarPathFinder.findPath(executor.getClass());
     }
     // "storm.jar" is a JVM-wide property, so set/submit/clear must not interleave.
     synchronized (StormExecutionRuntime.class) {
       System.setProperty("storm.jar", jarFile);
       LOG.info("Submitting as cluster mode ...");
       try {
         StormSubmitter.submitTopologyWithProgressBar(topologyName, stormConf, topology);
       } catch (AlreadyAliveException | InvalidTopologyException e) {
         LOG.error(e.getMessage(), e);
         throw new RuntimeException(e.getMessage(), e);
       } finally {
         System.clearProperty("storm.jar");
       }
     }
   }
   LOG.info("Started {} ({})", topologyName, executorName);
 }
 /**
  * Chooses the rating partition algorithm from the crossfold configuration.
  *
  * <p>Precedence is {@code holdout}, then {@code holdoutFraction}, then {@code retain}; when
  * none of them is set, a holdout of 10 is used. A warning is logged for every lower-precedence
  * setting that is also present and therefore ignored.
  *
  * @param cfg the crossfold configuration
  * @return the selected partition algorithm, never {@code null}
  */
 @Nonnull
 private PartitionAlgorithm<Rating> getRatingPartitionAlgorithm(Config cfg) {
   PartitionAlgorithm<Rating> partition = new HoldoutNPartition<>(10);
   if (cfg.hasPath("holdout")) {
     partition = new HoldoutNPartition<>(cfg.getInt("holdout"));
     if (cfg.hasPath("holdoutFraction")) {
       logger.warn("holdout and holdoutFraction specified, using holdout");
     }
     if (cfg.hasPath("retain")) {
       logger.warn("holdout and retain specified, using holdout");
     }
   } else if (cfg.hasPath("holdoutFraction")) {
     partition = new FractionPartition<>(cfg.getDouble("holdoutFraction"));
     if (cfg.hasPath("retain")) {
       // The setting that wins in this branch is holdoutFraction, so name it in the warning.
       logger.warn("holdoutFraction and retain specified, using holdoutFraction");
     }
   } else if (cfg.hasPath("retain")) {
     partition = new RetainNPartition<>(cfg.getInt("retain"));
   }
   return partition;
 }
 /** Reports whether the underlying configuration has a value at {@code propertyName}. */
 @Override
 public boolean isSet(String propertyName) {
   final boolean present = config.hasPath(propertyName);
   return present;
 }
  /**
   * Builds a {@link Crossfolder} from its specification.
   *
   * <p>{@code input} is required. {@code name}, {@code partitions}, {@code includeTimestamps},
   * {@code outputDir}, {@code outputFormat} and {@code isolate} are applied only when present.
   * {@code method} defaults to {@code partition-users}; {@code sampleSize} defaults to 1000 for
   * {@code sample-users}.
   *
   * @param context context used to build the input data source
   * @param cfg the crossfold specification
   * @return the configured crossfolder
   * @throws SpecificationException if the method or output format is not recognised, or if a
   *     nested specification is invalid
   */
  @Override
  public Crossfolder buildFromSpec(SpecificationContext context, Config cfg)
      throws SpecificationException {
    Crossfolder cf = new Crossfolder();
    if (cfg.hasPath("name")) {
      cf.setName(cfg.getString("name"));
    }
    cf.setSource(context.build(DataSource.class, cfg.getConfig("input")));
    if (cfg.hasPath("partitions")) {
      cf.setPartitionCount(cfg.getInt("partitions"));
    }

    String method = cfg.hasPath("method") ? cfg.getString("method") : "partition-users";
    switch (method) {
      case "partition-users":
        {
          PartitionAlgorithm<Rating> partition = getRatingPartitionAlgorithm(cfg);
          Order<Rating> order = getRatingOrder(cfg);
          cf.setMethod(CrossfoldMethods.partitionUsers(order, partition));
          break;
        }
      case "sample-users":
        {
          PartitionAlgorithm<Rating> partition = getRatingPartitionAlgorithm(cfg);
          Order<Rating> order = getRatingOrder(cfg);
          int sampleSize = cfg.hasPath("sampleSize") ? cfg.getInt("sampleSize") : 1000;
          cf.setMethod(CrossfoldMethods.sampleUsers(order, partition, sampleSize));
          break;
        }
      case "partition-ratings":
        cf.setMethod(CrossfoldMethods.partitionRatings());
        break;
      default:
        throw new SpecificationException("invalid crossfold method " + method);
    }

    if (cfg.hasPath("includeTimestamps")) {
      cf.setWriteTimestamps(cfg.getBoolean("includeTimestamps"));
    }

    if (cfg.hasPath("outputDir")) {
      cf.setOutputDir(cfg.getString("outputDir"));
    } else {
      logger.warn("no output directory specified for crossfold {}", cf.getName());
    }

    if (cfg.hasPath("outputFormat")) {
      switch (cfg.getString("outputFormat")) {
        case "pack":
          cf.setOutputFormat(OutputFormat.PACK);
          break;
        case "gzip":
          cf.setOutputFormat(OutputFormat.CSV_GZIP);
          break;
        case "xz":
          // "xz" must select the XZ-compressed CSV format, not the gzip one.
          cf.setOutputFormat(OutputFormat.CSV_XZ);
          break;
        default:
          throw new SpecificationException(
              "invalid output format " + cfg.getString("outputFormat"));
      }
    }

    if (cfg.hasPath("isolate")) {
      cf.setIsolate(cfg.getBoolean("isolate"));
    }

    return cf;
  }
Beispiel #24
0
  /**
   * Builds the inbound settings from the {@code inbound} section of {@code config}.
   *
   * <p>Column names come from {@code column-names}, or are generated as {@code "0".."n-1"} from
   * {@code num-columns}. Exactly one of {@code categorical-columns} and {@code numeric-columns}
   * must be given; the other set is derived as the complement over all columns. ID and ignored
   * columns are then removed from both sets.
   *
   * @param config configuration containing an {@code inbound} section
   * @return the resolved settings
   * @throws IllegalArgumentException if neither {@code categorical-columns} nor {@code
   *     numeric-columns} is set
   * @throws IllegalStateException if both column-type lists are given, or if the target column is
   *     neither numeric nor categorical after the removals
   */
  public static InboundSettings create(Config config) {
    Config inbound = config.getConfig("inbound");

    List<String> columnNames;
    if (inbound.hasPath("column-names")) {
      columnNames = inbound.getStringList("column-names");
    } else {
      // no explicit names: use the column index as the name
      int numColumns = inbound.getInt("num-columns");
      columnNames = new ArrayList<>(numColumns);
      for (int i = 0; i < numColumns; i++) {
        columnNames.add(String.valueOf(i));
      }
    }

    // presumably resolves a column reference to its index in columnNames -- confirm against
    // LookupFunction
    Function<Object, Integer> lookup = new LookupFunction(columnNames);

    Collection<Integer> allColumns = Collections2.transform(columnNames, lookup);

    Collection<Integer> idColumns;
    if (inbound.hasPath("id-columns")) {
      idColumns =
          ImmutableSet.copyOf(Collections2.transform(inbound.getAnyRefList("id-columns"), lookup));
    } else {
      idColumns = ImmutableSet.of();
    }

    Collection<Integer> ignoredColumns;
    if (inbound.hasPath("ignored-columns")) {
      ignoredColumns =
          ImmutableSet.copyOf(
              Collections2.transform(inbound.getAnyRefList("ignored-columns"), lookup));
    } else {
      ignoredColumns = ImmutableSet.of();
    }

    Collection<Integer> categoricalColumns;
    Collection<Integer> numericColumns;
    if (inbound.hasPath("categorical-columns")) {
      // the two lists are mutually exclusive
      Preconditions.checkState(!inbound.hasPath("numeric-columns"));
      categoricalColumns =
          new HashSet<>(
              Collections2.transform(inbound.getAnyRefList("categorical-columns"), lookup));
      // numeric = everything that is not categorical
      numericColumns = new HashSet<>(allColumns);
      numericColumns.removeAll(categoricalColumns);
    } else if (inbound.hasPath("numeric-columns")) {
      // NOTE(review): always true here, since the branch above already handled the case where
      // categorical-columns is present
      Preconditions.checkState(!inbound.hasPath("categorical-columns"));
      numericColumns =
          new HashSet<>(Collections2.transform(inbound.getAnyRefList("numeric-columns"), lookup));
      // categorical = everything that is not numeric
      categoricalColumns = new HashSet<>(allColumns);
      categoricalColumns.removeAll(numericColumns);
    } else {
      throw new IllegalArgumentException("No categorical-columns or numeric-columns set");
    }
    // ID and ignored columns are neither numeric nor categorical
    numericColumns.removeAll(idColumns);
    numericColumns.removeAll(ignoredColumns);
    categoricalColumns.removeAll(idColumns);
    categoricalColumns.removeAll(ignoredColumns);

    Integer targetColumn = null;
    if (inbound.hasPath("target-column")) {
      targetColumn = lookup.apply(inbound.getAnyRef("target-column"));
      // checked after the removals above, so an ID or ignored column cannot be the target
      Preconditions.checkState(
          categoricalColumns.contains(targetColumn) || numericColumns.contains(targetColumn),
          "Target column not specified as numeric or categorical");
    }

    return new InboundSettings(
        columnNames, idColumns, categoricalColumns, numericColumns, ignoredColumns, targetColumn);
  }
    /**
     * Creates the command: resolves the Solr schema, collects the extraction parameters, sets up
     * the content handler factory and registers the configured Tika parsers by media type.
     *
     * @param builder forwarded to the superclass constructor
     * @param config command configuration; {@code solrLocator} and {@code parsers} are read
     *     without a default
     * @param parent forwarded to the superclass constructor
     * @param child forwarded to the superclass constructor
     * @param context forwarded to the superclass constructor and to the Solr locator
     * @throws MorphlineCompilationException if the content handler factory class cannot be found,
     *     or a configured parser cannot be instantiated or is not a Tika {@code Parser}
     */
    public SolrCell(
        CommandBuilder builder,
        Config config,
        Command parent,
        Command child,
        MorphlineContext context) {
      super(builder, config, parent, child, context);

      // Resolve the index schema through the configured Solr locator.
      Config solrLocatorConfig = getConfigs().getConfig(config, "solrLocator");
      SolrLocator locator = new SolrLocator(solrLocatorConfig, context);
      LOG.debug("solrLocator: {}", locator);
      this.schema = locator.getIndexSchema();
      Preconditions.checkNotNull(schema);
      LOG.trace(
          "Solr schema: \n{}", Joiner.on("\n").join(new TreeMap(schema.getFields()).values()));

      // Collect the optional extraction parameters; absent settings are simply not added.
      ListMultimap<String, String> cellParams = ArrayListMultimap.create();
      String uprefix = getConfigs().getString(config, ExtractingParams.UNKNOWN_FIELD_PREFIX, null);
      if (uprefix != null) {
        cellParams.put(ExtractingParams.UNKNOWN_FIELD_PREFIX, uprefix);
      }
      for (String capture :
          getConfigs()
              .getStringList(
                  config, ExtractingParams.CAPTURE_ELEMENTS, Collections.<String>emptyList())) {
        cellParams.put(ExtractingParams.CAPTURE_ELEMENTS, capture);
      }
      // each "fmap" entry becomes a MAP_PREFIX-prefixed parameter
      Config fmapConfig = getConfigs().getConfig(config, "fmap", null);
      if (fmapConfig != null) {
        for (Map.Entry<String, Object> entry : new Configs().getEntrySet(fmapConfig)) {
          cellParams.put(ExtractingParams.MAP_PREFIX + entry.getKey(), entry.getValue().toString());
        }
      }
      String captureAttributes =
          getConfigs().getString(config, ExtractingParams.CAPTURE_ATTRIBUTES, null);
      if (captureAttributes != null) {
        cellParams.put(ExtractingParams.CAPTURE_ATTRIBUTES, captureAttributes);
      }
      String lowerNames = getConfigs().getString(config, ExtractingParams.LOWERNAMES, null);
      if (lowerNames != null) {
        cellParams.put(ExtractingParams.LOWERNAMES, lowerNames);
      }
      String defaultField = getConfigs().getString(config, ExtractingParams.DEFAULT_FIELD, null);
      if (defaultField != null) {
        cellParams.put(ExtractingParams.DEFAULT_FIELD, defaultField);
      }
      // the XPath expression is also kept on the instance, not only in the parameters
      xpathExpr = getConfigs().getString(config, ExtractingParams.XPATH_EXPRESSION, null);
      if (xpathExpr != null) {
        cellParams.put(ExtractingParams.XPATH_EXPRESSION, xpathExpr);
      }

      // defaults to a copy of DateUtil.DEFAULT_DATE_FORMATS
      this.dateFormats =
          getConfigs()
              .getStringList(config, "dateFormats", new ArrayList<>(DateUtil.DEFAULT_DATE_FORMATS));

      // Load the content handler factory class by name (TrimSolrContentHandlerFactory by default).
      String handlerStr =
          getConfigs()
              .getString(
                  config,
                  "solrContentHandlerFactory",
                  TrimSolrContentHandlerFactory.class.getName());
      Class<? extends SolrContentHandlerFactory> factoryClass;
      try {
        factoryClass = (Class<? extends SolrContentHandlerFactory>) Class.forName(handlerStr);
      } catch (ClassNotFoundException cnfe) {
        throw new MorphlineCompilationException(
            "Could not find class " + handlerStr + " to use for " + "solrContentHandlerFactory",
            config,
            cnfe);
      }
      this.solrContentHandlerFactory =
          getSolrContentHandlerFactory(factoryClass, dateFormats, config);

      this.locale = getLocale(getConfigs().getString(config, "locale", ""));

      this.mediaTypeToParserMap = new HashMap<>();
      // MimeTypes mimeTypes = MimeTypes.getDefaultMimeTypes(); // FIXME
      // getMediaTypeRegistry.normalize()

      // Instantiate every configured parser and register the media types it handles.
      List<? extends Config> parserConfigs = getConfigs().getConfigList(config, "parsers");
      for (Config parserConfig : parserConfigs) {
        String parserClassName = getConfigs().getString(parserConfig, "parser");

        Object obj;
        try {
          obj = Class.forName(parserClassName).newInstance();
        } catch (Throwable e) {
          throw new MorphlineCompilationException(
              "Cannot instantiate Tika parser: " + parserClassName, config, e);
        }
        if (!(obj instanceof Parser)) {
          throw new MorphlineCompilationException(
              "Tika parser "
                  + obj.getClass().getName()
                  + " must be an instance of class "
                  + Parser.class.getName(),
              config);
        }
        Parser parser = (Parser) obj;
        this.parsers.add(parser);

        // explicitly configured media types for this parser (empty when not configured)
        List<String> mediaTypes =
            getConfigs()
                .getStringList(parserConfig, SUPPORTED_MIME_TYPES, Collections.<String>emptyList());
        for (String mediaTypeStr : mediaTypes) {
          MediaType mediaType = parseMediaType(mediaTypeStr);
          addSupportedMimeType(mediaTypeStr);
          this.mediaTypeToParserMap.put(mediaType, parser);
        }

        // Without an explicit list, fall back to the parser's own supported base types plus
        // any additionally configured ones.
        if (!parserConfig.hasPath(SUPPORTED_MIME_TYPES)) {
          for (MediaType mediaType : parser.getSupportedTypes(new ParseContext())) {
            mediaType = mediaType.getBaseType();
            addSupportedMimeType(mediaType.toString());
            this.mediaTypeToParserMap.put(mediaType, parser);
          }
          List<String> extras =
              getConfigs()
                  .getStringList(
                      parserConfig,
                      ADDITIONAL_SUPPORTED_MIME_TYPES,
                      Collections.<String>emptyList());
          for (String mediaTypeStr : extras) {
            MediaType mediaType = parseMediaType(mediaTypeStr);
            addSupportedMimeType(mediaTypeStr);
            this.mediaTypeToParserMap.put(mediaType, parser);
          }
        }
      }
      // LOG.info("mediaTypeToParserMap="+mediaTypeToParserMap);

      // Flatten the multimap into the String[]-valued map expected by MultiMapSolrParams.
      Map<String, String[]> tmp = new HashMap();
      for (Map.Entry<String, Collection<String>> entry : cellParams.asMap().entrySet()) {
        tmp.put(entry.getKey(), entry.getValue().toArray(new String[entry.getValue().size()]));
      }
      this.solrParams = new MultiMapSolrParams(tmp);
      validateArguments();
    }
Beispiel #26
0
  /**
   * Starts an embedded redis server, creates the redis client and builds the actor system together
   * with the recording, trace-log, analysis and HTTP actors.
   *
   * <p>Optional configuration keys read from {@code config}:
   *
   * <ul>
   *   <li>{@code components.redis.port} - redis port, defaults to {@code
   *       RedisURI.DEFAULT_REDIS_PORT}
   *   <li>{@code components.redis.log-level} - redis log level, defaults to {@code verbose}
   *   <li>{@code components.redis.log-file} - redis log file, defaults to {@code redis.log} in the
   *       log base directory
   *   <li>{@code components.redis.pid-file} - redis pid file, defaults to {@code redis.pid} in the
   *       log base directory
   * </ul>
   *
   * <p>The log base directory is the {@code LOG_BASE} environment variable, then the {@code TEMP}
   * environment variable, then the {@code java.io.tmpdir} system property.
   *
   * @param config configuration used for the redis settings and passed to the actor system
   * @return the newly created actor system
   * @throws IllegalStateException if the redis server cannot be built
   * @throws Exception if any other step of the setup fails
   */
  @Override
  public ActorSystem buildActorSystem(Config config) throws Exception {

    // start redis
    final int redisPort =
        config.hasPath("components.redis.port")
            ? config.getInt("components.redis.port")
            : RedisURI.DEFAULT_REDIS_PORT;
    final String redisLogLevel =
        config.hasPath("components.redis.log-level")
            ? config.getString("components.redis.log-level")
            : "verbose";

    // Resolve the directory for the default log/pid files. TEMP is normally only defined on
    // Windows, so fall back to the JVM temp directory instead of producing a "null\..." path.
    String logBase = System.getenv("LOG_BASE");
    if (StringUtils.isBlank(logBase)) {
      logBase = System.getenv("TEMP");
    }
    if (StringUtils.isBlank(logBase)) {
      logBase = System.getProperty("java.io.tmpdir");
    }

    // java.io.File joins with the platform separator rather than a hard-coded backslash.
    final String redisLogFile =
        config.hasPath("components.redis.log-file")
            ? config.getString("components.redis.log-file")
            : new java.io.File(logBase, "redis.log").getPath();
    final String redisPidFile =
        config.hasPath("components.redis.pid-file")
            ? config.getString("components.redis.pid-file")
            : new java.io.File(logBase, "redis.pid").getPath();

    try {
      this.redis =
          RedisServer.builder()
              .redisExecProvider(RedisExecProvider.defaultProvider())
              .port(redisPort)
              .setting("loglevel " + redisLogLevel)
              .setting("logfile " + redisLogFile)
              .setting("pidfile " + redisPidFile)
              .build();
    } catch (Exception ex) {
      this.logger.error("Fail to build redis server.", ex);
      throw new IllegalStateException("Fail to build redis server.", ex);
    }

    // The server is started on its own thread; a start failure is only logged and does not
    // abort the construction of the actor system below.
    new Thread() {
      @Override
      public void run() {
        try {
          redis.start();
          logger.info("Started redis server on {} port", redisPort);
        } catch (Exception ex) {
          logger.error("Fail to start redis server.", ex);
          // @TODO Use future to stop the actor system at this point.
        }
      }
    }.start();

    // create redis client
    String redisUri = "redis://" + this.getAddress().getHostAddress() + ":" + redisPort + "/0";
    this.redisClient = new RedisClient(RedisURI.create(redisUri));

    ActorSystem system = ActorSystem.create(this.getClusterName(), config);
    Camel camel = CamelExtension.get(system);

    this.baseUrl =
        "http://"
            + this.getAddress().getHostAddress()
            + ":"
            + this.getHttpPort()
            + "/"
            + this.getApplicationName();
    String uri = "jetty:" + this.baseUrl;

    String recorderKeyBase = this.getClusterName() + ":" + "words";
    ActorRef recordingService =
        system.actorOf(
            Props.create(RecordingService.class, recorderKeyBase, this.redisClient),
            "recorderService");

    String tracerKey = this.getClusterName() + ":trace:node:1";
    ActorRef traceLogService =
        system.actorOf(
            Props.create(TraceLogService.class, tracerKey, this.redisClient, this.jacksonMapper),
            "traceLogService");

    ActorRef analysisService =
        system.actorOf(
            Props.create(AnalysisService.class, recordingService, traceLogService),
            "analysisService");

    // Routing keys are the cluster ports; each maps to the analysis service on that port.
    String pathBase =
        "akka.tcp://" + this.getClusterName() + "@" + this.getAddress().getHostAddress() + ":";
    SimpleRoutingMap<String> routingMap = new SimpleRoutingMap<String>();
    routingMap.putPath(new Key<String>("2551"), pathBase + "2551/user/analysisService");
    routingMap.putPath(new Key<String>("2552"), pathBase + "2552/user/analysisService");

    ActorRef httpClerk =
        system.actorOf(Props.create(WebService.class, uri, routingMap), "httpClerk");

    // Request the activation future for the HTTP endpoint; the result is not awaited here.
    camel.activationFutureFor(
        httpClerk, new Timeout(Duration.create(10, TimeUnit.SECONDS)), system.dispatcher());

    return system;
  }
Beispiel #27
0
  /**
   * Makes sure the word2vec model file for the metric {@code name} is available, either by
   * installing a prebuilt model or by training one from the configured corpus.
   *
   * <p>Steps, in order:
   *
   * <ol>
   *   <li>Returns immediately if {@code skipBuiltMetrics} is set and the model file exists.
   *   <li>If the generator config has {@code prebuilt = true}: returns if the model exists,
   *       otherwise copies (or gunzips, when the name ends with "gz") the file at {@code binfile}
   *       into the model location.
   *   <li>Otherwise builds the link probability dao if needed, creates the corpus if needed, and
   *       trains and saves a new model unless an existing model is newer than the corpus file.
   * </ol>
   *
   * @param name name of the metric whose "generator" configuration is used
   * @throws ConfigurationException if the prebuilt file is missing, does not support the
   *     language, or no corpus is configured and no model exists
   * @throws IOException if copying or decompressing the prebuilt model fails
   * @throws DaoException declared for the dao operations used while building
   */
  private void initWord2Vec(String name) throws ConfigurationException, IOException, DaoException {
    Config config = getMetricConfig(name).getConfig("generator");
    File model = Word2VecGenerator.getModelFile(config.getString("modelDir"), language);
    if (skipBuiltMetrics && model.isFile()) {
      return;
    }

    if (config.hasPath("prebuilt") && config.getBoolean("prebuilt")) {
      if (model.isFile()) {
        return;
      }
      File downloadPath = new File(config.getString("binfile"));
      if (!downloadPath.isFile()) {
        throw new ConfigurationException(
            "word2vec model "
                + downloadPath.getAbsolutePath()
                + " cannot be found."
                + " You must download it from "
                + config.getString("url")
                + " into to the wikibrain download directory.");
      }
      if (!config.getStringList("languages").contains(language.getLangCode())) {
        throw new ConfigurationException(
            "word2vec model " + downloadPath + " does not support language " + language);
      }
      if (downloadPath.toString().toLowerCase().endsWith("gz")) {
        LOG.info("decompressing " + downloadPath + " to " + model);
        // Decompress into a temp file first so a failed copy never leaves a partial model.
        File tmp = File.createTempFile("word2vec", "bin");
        try {
          FileUtils.deleteQuietly(tmp);
          // Open the raw stream separately so it is closed even if the gzip header is invalid
          // and the GZIPInputStream constructor throws.
          FileInputStream raw = new FileInputStream(downloadPath);
          try {
            GZIPInputStream gz = new GZIPInputStream(raw);
            try {
              FileUtils.copyInputStreamToFile(gz, tmp);
            } finally {
              gz.close();
            }
          } finally {
            raw.close();
          }
          model.getParentFile().mkdirs();
          FileUtils.moveFile(tmp, model);
        } finally {
          FileUtils.deleteQuietly(tmp);
        }
      } else {
        FileUtils.copyFile(downloadPath, model);
      }
      return;
    }

    LinkProbabilityDao lpd = env.getConfigurator().get(LinkProbabilityDao.class);
    lpd.useCache(true);
    if (!lpd.isBuilt()) {
      lpd.build();
    }

    // A corpus name of "NONE" means no corpus is configured for this metric.
    String corpusName = config.getString("corpus");
    Corpus corpus = null;
    if (!corpusName.equals("NONE")) {
      corpus =
          env.getConfigurator().get(Corpus.class, corpusName, "language", language.getLangCode());
      if (!corpus.exists()) {
        corpus.create();
      }
    }

    // Reuse an existing model unless the corpus file has been modified after it was written.
    if (model.isFile()
        && (corpus == null || model.lastModified() > corpus.getCorpusFile().lastModified())) {
      return;
    }
    if (corpus == null) {
      throw new ConfigurationException(
          "word2vec metric "
              + name
              + " cannot build or find model! "
              + "configuration has no corpus, but model not found at "
              + model
              + ".");
    }
    Word2VecTrainer trainer =
        new Word2VecTrainer(env.getConfigurator().get(LocalPageDao.class), language);
    if (config.hasPath("dimensions")) {
      int dimensions = config.getInt("dimensions");
      LOG.info("set number of dimensions to " + dimensions);
      trainer.setLayer1Size(dimensions);
    }
    if (config.hasPath("maxWords")) {
      int maxWords = config.getInt("maxWords");
      LOG.info("set maxWords to " + maxWords);
      trainer.setMaxWords(maxWords);
    }
    if (config.hasPath("window")) {
      // Log the window value itself; reading "maxWords" here would report the wrong number
      // and fail when only "window" is configured.
      int window = config.getInt("window");
      LOG.info("set window to " + window);
      trainer.setWindow(window);
    }
    trainer.setKeepAllArticles(true);
    trainer.train(corpus.getDirectory());
    trainer.save(model);
  }