Пример #1
0
  /**
   * Loads the model for {@code mostRecentModelGeneration} if it is newer than the generation
   * currently held, replacing {@code currentModel} and {@code modelGeneration}.
   *
   * <p>The PMML description is downloaded from the store into a temporary file, parsed, and the
   * temporary file is then deleted. If the download or the parse fails, the temporary file is
   * removed on a best-effort basis before the failure propagates, so repeated failed attempts do
   * not accumulate files for the lifetime of the JVM.
   *
   * @param mostRecentModelGeneration generation to load; ignored when not newer than the current
   *     one
   * @throws IOException if creating, downloading, reading or deleting the model file fails
   */
  @Override
  protected void loadRecentModel(long mostRecentModelGeneration) throws IOException {
    if (mostRecentModelGeneration <= modelGeneration) {
      return;
    }
    if (modelGeneration == NO_GENERATION) {
      log.info("Most recent generation {} is the first available one", mostRecentModelGeneration);
    } else {
      log.info(
          "Most recent generation {} is newer than current {}",
          mostRecentModelGeneration,
          modelGeneration);
    }

    // Reserve a unique temp path, then remove the empty placeholder so that only the path
    // itself is handed to the download below.
    File modelPMMLFile = File.createTempFile("model-", ".pmml.gz");
    modelPMMLFile.deleteOnExit();
    IOUtils.delete(modelPMMLFile);

    Pair<DecisionForest, Map<Integer, BiMap<String, Integer>>> forestAndCatalog;
    boolean loaded = false;
    try {
      Config config = ConfigUtils.getDefaultConfig();
      String instanceDir = config.getString("model.instance-dir");

      String generationPrefix =
          Namespaces.getInstanceGenerationPrefix(instanceDir, mostRecentModelGeneration);
      String modelPMMLKey = generationPrefix + "model.pmml.gz";
      Store.get().download(modelPMMLKey, modelPMMLFile);
      log.info("Loading model description from {}", modelPMMLKey);

      forestAndCatalog = DecisionForestPMML.read(modelPMMLFile);
      loaded = true;
    } finally {
      // Failure path only: clean up quietly so the original exception is not masked.
      if (!loaded && modelPMMLFile.exists() && !modelPMMLFile.delete()) {
        log.warn("Could not delete temporary model file {}", modelPMMLFile);
      }
    }
    IOUtils.delete(modelPMMLFile);
    log.info("Loaded model description");

    // State is only updated once the model has been fully read and the temp file removed.
    modelGeneration = mostRecentModelGeneration;
    currentModel = new Generation(forestAndCatalog.getFirst(), forestAndCatalog.getSecond());
  }
  /**
   * Overlays the "predicting R" test configuration on the defaults, builds the X*Y' product via
   * {@code AlternatingLeastSquaresTest.buildTestXYTProduct()} and compares each of its first five
   * rows with the expected values.
   */
  @Test
  public void testALSPredictingR() throws Exception {
    ConfigUtils.overlayConfigOnDefault(
        getResourceAsFile("AlternatingLeastSquaresPredictingRTest.conf"));

    RealMatrix product = AlternatingLeastSquaresTest.buildTestXYTProduct();

    // Expected product, one entry per matrix row, checked in row order.
    float[][] expectedRows = {
      {0.0678369f, 0.6574759f, 2.1020291f, 2.0976211f, 0.1115919f},
      {-0.0176293f, 1.3062225f, 4.1365933f, 4.1739127f, -0.0380586f},
      {1.0854513f, -0.0344434f, 0.1725342f, -0.1564803f, 1.8502977f},
      {2.8377915f, 0.0528524f, 0.9041158f, 0.0474437f, 4.8365208f},
      {-0.0057799f, 0.6608552f, 2.0936351f, 2.1115670f, -0.0139042f},
    };
    for (int row = 0; row < expectedRows.length; row++) {
      assertArrayEquals(expectedRows[row], product.getRow(row));
    }
  }
Пример #3
0
 /**
  * Registers this application's servlets on the given context. The classify endpoint is always
  * registered; the train and refresh endpoints are registered only when the
  * {@code serving-layer.api.read-only} setting is false.
  *
  * @param context context to which the servlets are added
  */
 @Override
 public void addServlets(Context context) {
   addServlet(context, new ClassifyServlet(), "/classify/*");

   boolean readOnly = ConfigUtils.getDefaultConfig().getBoolean("serving-layer.api.read-only");
   if (readOnly) {
     return;
   }

   // Endpoints that modify state are only exposed when the API is not read-only.
   addServlet(context, new TrainServlet(), "/train/*");
   addServlet(context, new RefreshServlet(), "/refresh/*");
 }
Пример #4
0
 public CandidateFilterFactory() {
   Config config = ConfigUtils.getDefaultConfig();
   lshSampleRatio = config.getDouble("model.lsh.sample-ratio");
   numHashes = config.getInt("model.lsh.num-hashes");
   candidateFilterClassName =
       config.hasPath("serving-layer.candidate-filter-class")
           ? config.getString("serving-layer.candidate-filter-class")
           : null;
 }
Пример #5
0
  /**
   * Builds the pipeline for one iteration of k-sketch selection.
   *
   * <p>The pipeline reads the normalized vectors of the current generation, computes each
   * vector's distance to the closest vector of the current k-sketch, draws a weighted sample of
   * {@code settings.getSketchPoints()} vectors per group and writes the updated
   * {@code KSketchIndex} under {@code sketch/<iteration>/}.
   *
   * <p>Storage keys are built by plain concatenation rather than {@code String.format("%d")},
   * whose digits depend on the default locale and could otherwise yield different keys on
   * different machines.
   *
   * @return the configured pipeline, or {@code null} when {@code validOutputPath} rejects the
   *     output key
   * @throws IOException if the pipeline cannot be set up
   */
  @Override
  protected MRPipeline createPipeline() throws IOException {
    JobStepConfig stepConfig = getConfig();
    ClusterSettings settings = ClusterSettings.create(ConfigUtils.getDefaultConfig());

    String instanceDir = stepConfig.getInstanceDir();
    int generationID = stepConfig.getGenerationID();
    int iteration = stepConfig.getIteration();
    String prefix = Namespaces.getInstanceGenerationPrefix(instanceDir, generationID);
    String outputKey = prefix + "sketch/" + iteration + "/";
    if (!validOutputPath(outputKey)) {
      return null;
    }

    // Read the normalized vectors.
    String inputKey = prefix + "normalized/";
    MRPipeline p = createBasicPipeline(DistanceToClosestFn.class);
    AvroType<Pair<Integer, RealVector>> inputType = Avros.pairs(Avros.ints(), MLAvros.vector());
    PCollection<Pair<Integer, RealVector>> in = p.read(avroInput(inputKey, inputType));

    // Either create or load the set of currently chosen k-sketch vectors;
    // they are stored in a KSketchIndex object.
    DistanceToClosestFn<RealVector> distanceToClosestFn;
    UpdateIndexFn updateIndexFn;
    if (iteration == 1) {
      // Iteration 1 is the first real iteration; iteration 0 contains initial state.
      KSketchIndex index = createInitialIndex(settings, in);
      distanceToClosestFn = new DistanceToClosestFn<>(index);
      updateIndexFn = new UpdateIndexFn(index);
    } else {
      // Later iterations start from the index written by the previous iteration.
      String previousIndexKey = prefix + "sketch/" + (iteration - 1) + "/";
      distanceToClosestFn = new DistanceToClosestFn<>(previousIndexKey);
      updateIndexFn = new UpdateIndexFn(previousIndexKey);
    }

    // Compute the distance of each vector in the dataset to the closest vector in the k-sketch.
    PTable<Integer, Pair<RealVector, Double>> weighted =
        in.parallelDo(
            "computeDistances",
            distanceToClosestFn,
            Avros.tableOf(Avros.ints(), Avros.pairs(MLAvros.vector(), Avros.doubles())));

    // Run weighted reservoir sampling to select another group of settings.getSketchPoints()
    // vectors to add to the k-sketch.
    PTable<Integer, RealVector> kSketchSample =
        ReservoirSampling.groupedWeightedSample(
            weighted, settings.getSketchPoints(), RandomManager.getRandom());

    // Update the KSketchIndex with the newly chosen vectors and persist it.
    kSketchSample
        .parallelDo("updateIndex", updateIndexFn, Serializables.avro(KSketchIndex.class))
        .write(avroOutput(outputKey));

    return p;
  }
Пример #6
0
 /**
  * Performs the superclass initialization, reads the number of recommendations to produce from
  * {@code model.recommend.how-many} and creates the ID mapping state from the job
  * configuration.
  *
  * @throws IllegalArgumentException if the configured number of recommendations is not positive
  * @throws CrunchRuntimeException if the ID mapping state cannot be loaded
  */
 @Override
 public void initialize() {
   super.initialize();

   int howMany = ConfigUtils.getDefaultConfig().getInt("model.recommend.how-many");
   numRecs = howMany;
   Preconditions.checkArgument(howMany > 0, "# recommendations must be positive: %s", howMany);

   try {
     idMapping = new IDMappingState(getConfiguration());
   } catch (IOException ioe) {
     // initialize() declares no checked exceptions, so wrap and rethrow.
     throw new CrunchRuntimeException(ioe);
   }
 }
Пример #7
0
 /**
  * Determines the URI prefix for storage locations: {@code file:} when {@code model.local} is
  * set, otherwise {@code hdfs://host[:port]} taken from the default file system URI. The port is
  * included only when it is positive.
  */
 private Namespaces() {
   String resolvedPrefix;
   if (ConfigUtils.getDefaultConfig().getBoolean("model.local")) {
     resolvedPrefix = "file:";
   } else {
     URI fsUri = FileSystem.getDefaultUri(new OryxConfiguration());
     String host = fsUri.getHost();
     int port = fsUri.getPort();
     String portSuffix = port > 0 ? ":" + port : "";
     resolvedPrefix = "hdfs://" + host + portSuffix;
   }
   prefix = resolvedPrefix;
 }
Пример #8
0
 /**
  * Performs the superclass initialization and reads the decay settings from the default
  * configuration: {@code model.decay.factor} and {@code model.decay.zeroThreshold}. Decay is
  * enabled only when the factor is strictly less than 1.
  *
  * @throws IllegalArgumentException if the decay factor is outside (0,1] or the zero threshold
  *     is negative
  */
 @Override
 public void initialize() {
   super.initialize();
   Config config = ConfigUtils.getDefaultConfig();

   decayFactor = (float) config.getDouble("model.decay.factor");
   // Report the offending decay factor itself; zeroThreshold has not been read yet here.
   Preconditions.checkArgument(
       decayFactor > 0.0f && decayFactor <= 1.0f,
       "Decay factor must be in (0,1]: %s",
       decayFactor);

   zeroThreshold = (float) config.getDouble("model.decay.zeroThreshold");
   Preconditions.checkArgument(
       zeroThreshold >= 0.0f, "Zero threshold must be nonnegative: %s", zeroThreshold);

   // A factor of exactly 1 leaves values unchanged, so decay is skipped in that case.
   doDecay = decayFactor < 1.0f;
 }
Пример #9
0
  /**
   * Builds the pipeline that weights the k-sketch vectors of the current generation.
   *
   * <p>First, every normalized input vector is assigned to its closest sketch vector and the
   * per-sketch-vector data is aggregated under a single key and written to
   * {@code weighted/kSketchVectorWeights/}. Second, the aggregated data is turned into weighted
   * vectors, written to {@code weighted/weightedKSketchVectors/}.
   *
   * @return the configured pipeline, or {@code null} when {@code validOutputPath} rejects the
   *     output key
   * @throws IOException if the pipeline cannot be set up
   */
  @Override
  protected MRPipeline createPipeline() throws IOException {
    JobStepConfig jobConfig = getConfig();
    ClusterSettings settings = ClusterSettings.create(ConfigUtils.getDefaultConfig());

    String instanceDir = jobConfig.getInstanceDir();
    long generation = jobConfig.getGenerationID();
    String generationPrefix = Namespaces.getInstanceGenerationPrefix(instanceDir, generation);
    String weightedKey = generationPrefix + "weighted/";
    if (!validOutputPath(weightedKey)) {
      return null;
    }

    String sketchIndexKey = generationPrefix + "sketch/" + settings.getSketchIterations();
    String normalizedKey = generationPrefix + "normalized/";
    MRPipeline pipeline = createBasicPipeline(ClosestSketchVectorFn.class);

    // Step 1: compute the weight of each k-sketch vector (its Voronoi partition), aggregate
    // everything under one key and persist the result.
    PCollection<ClosestSketchVectorData> sketchWeights =
        PTables.asPTable(
                inputPairs(pipeline, normalizedKey, MLAvros.vector())
                    .parallelDo(
                        "computingSketchVectorWeights",
                        new ClosestSketchVectorFn<RealVector>(sketchIndexKey, settings),
                        Avros.pairs(Avros.ints(), Avros.reflects(ClosestSketchVectorData.class))))
            .groupByKey(1)
            .combineValues(new ClosestSketchVectorAggregator(settings))
            .values()
            .write(avroOutput(weightedKey + "kSketchVectorWeights/"));

    // Step 2: map the single aggregated ClosestSketchVectorData to weighted vectors. This could
    // run outside MapReduce, but that would require materializing the aggregated data first.
    sketchWeights
        .parallelDo(
            "generatingWeightedSketchVectors",
            new WeightVectorsFn(sketchIndexKey),
            KMeansTypes.FOLD_WEIGHTED_VECTOR)
        .write(avroOutput(weightedKey + "weightedKSketchVectors/"));

    return pipeline;
  }
Пример #10
0
 /**
  * Creates the model manager used by this test.
  *
  * @return a new {@code MockServingModelManager} built from {@code ConfigUtils.getDefault()}
  */
 protected MockServingModelManager getModelManager() {
   return new MockServingModelManager(ConfigUtils.getDefault());
 }
Пример #11
0
  /**
   * Runs the speed layer with {@code ALSSpeedModelManager} against mock input and mock model
   * updates, then checks the updates it publishes: one model message, nine updates echoing the
   * mock model's existing vectors, and nine updates for newly seen users and items.
   */
  @Test
  public void testALSSpeed() throws Exception {
    Map<String, Object> overrides = new HashMap<>();
    overrides.put("oryx.speed.model-manager-class", ALSSpeedModelManager.class.getName());
    overrides.put("oryx.speed.streaming.generation-interval-sec", 5);
    overrides.put("oryx.als.hyperparams.features", 2);
    Config speedConfig = ConfigUtils.overlayOn(overrides, getConfig());

    startMessaging();

    List<Pair<String, String>> updates =
        startServerProduceConsumeTopics(
            speedConfig, new MockALSInputGenerator(), new MockALSModelUpdateGenerator(), 9, 10);

    if (log.isDebugEnabled()) {
      for (Pair<String, String> update : updates) {
        log.debug("{}", update);
      }
    }

    // 10 original updates; 9 of them produce just 1 more update each, because the user or
    // item involved is new.
    assertEquals(19, updates.size());

    // The first message carries the model itself, configured with 2 features.
    Pair<String, String> modelMessage = updates.get(0);
    assertEquals("MODEL", modelMessage.getFirst());
    String featuresValue =
        AppPMMLUtils.getExtensionValue(
            PMMLUtils.fromString(updates.get(0).getSecond()), "features");
    assertEquals(2, Integer.parseInt(featuresValue));

    // Messages 1-9 repeat the vectors and known IDs of the mock model.
    for (int index = 1; index <= 9; index++) {
      Pair<String, String> message = updates.get(index);
      assertEquals("UP", message.getFirst());
      List<?> fields = MAPPER.readValue(updates.get(index).getSecond(), List.class);
      boolean isX = "X".equals(fields.get(0).toString());
      String id = fields.get(1).toString();

      float[] expectedVector;
      if (isX) {
        expectedVector = MockALSModelUpdateGenerator.X.get(id);
      } else {
        expectedVector = MockALSModelUpdateGenerator.Y.get(id);
      }
      assertArrayEquals(expectedVector, MAPPER.convertValue(fields.get(2), float[].class));

      @SuppressWarnings("unchecked")
      Collection<String> actualKnownIDs = (Collection<String>) fields.get(3);
      Collection<String> expectedKnownIDs;
      if (isX) {
        expectedKnownIDs = MockALSModelUpdateGenerator.A.get(id);
      } else {
        expectedKnownIDs = MockALSModelUpdateGenerator.At.get(id);
      }
      // Containment in both directions: the two collections hold the same elements.
      assertTrue(actualKnownIDs.containsAll(expectedKnownIDs));
      assertTrue(expectedKnownIDs.containsAll(actualKnownIDs));
    }

    /*
     * Users 100 - 104 are solutions to eye(5)*Y*pinv(Y'*Y); because they are brand new,
     * default scaling yields values that are 3/4 of that, i.e. the solution to
     * (0.75*eye(5))*Y*pinv(Y'*Y). Likewise 105 - 108 are (0.75*eye(4))*X*pinv(X'*X).
     */
    Map<String, float[]> expectedNewX =
        MockALSModelUpdateGenerator.buildMatrix(
            100,
            new float[][] {
              {-0.20859924f, 0.25232133f},
              {-0.22472803f, -0.1929485f},
              {-0.15592135f, 0.3977631f},
              {-0.3006522f, -0.12239703f},
              {-0.09205295f, -0.37471837f},
            });
    Map<String, float[]> expectedNewY =
        MockALSModelUpdateGenerator.buildMatrix(
            105,
            new float[][] {
              {-0.19663288f, 0.09574106f},
              {-0.23840417f, -0.50850725f},
              {-0.34360975f, 0.2466687f},
              {-0.060204573f, 0.29311115f},
            });

    // Messages 10-18 describe the new users and items; each knows exactly one counterpart.
    for (int index = 10; index <= 18; index++) {
      Pair<String, String> message = updates.get(index);
      assertEquals("UP", message.getFirst());
      List<?> fields = MAPPER.readValue(updates.get(index).getSecond(), List.class);
      boolean isX = "X".equals(fields.get(0).toString());
      String id = fields.get(1).toString();

      float[] expectedVector = isX ? expectedNewX.get(id) : expectedNewY.get(id);
      assertArrayEquals(
          expectedVector, MAPPER.convertValue(fields.get(2), float[].class), 1.0e-5f);

      String counterpartID = ALSUtilsTest.idToStringID(ALSUtilsTest.stringIDtoID(id) - 99);
      @SuppressWarnings("unchecked")
      Collection<String> actualKnownIDs = (Collection<String>) fields.get(3);
      assertEquals(1, actualKnownIDs.size());
      assertEquals(counterpartID, actualKnownIDs.iterator().next());
    }
  }
Пример #12
0
 /**
  * Creates a manager that starts with no model generation loaded and reads the
  * {@code serving-layer.disable-write-updates} flag from the default configuration.
  *
  * @param appendTempDir directory passed through to the superclass constructor
  * @throws IOException if the superclass constructor fails
  */
 public RDFGenerationManager(File appendTempDir) throws IOException {
   super(appendTempDir);
   modelGeneration = NO_GENERATION;
   disableWriteUpdates =
       ConfigUtils.getDefaultConfig().getBoolean("serving-layer.disable-write-updates");
 }