/**
 * Loads the model for the given generation if it is newer than the one currently held.
 *
 * <p>The generation's {@code model.pmml.gz} is downloaded from the {@link Store} into a
 * temporary file, parsed into a decision forest plus its category catalog, and installed as
 * the current model. The temporary file is removed whether or not loading succeeds.
 *
 * @param mostRecentModelGeneration ID of the most recent generation available
 * @throws IOException if the temporary file cannot be created or deleted, or if downloading or
 *     reading the model fails
 */
@Override
protected void loadRecentModel(long mostRecentModelGeneration) throws IOException {
  if (mostRecentModelGeneration <= modelGeneration) {
    // Nothing newer than what is already loaded
    return;
  }
  if (modelGeneration == NO_GENERATION) {
    log.info("Most recent generation {} is the first available one", mostRecentModelGeneration);
  } else {
    log.info(
        "Most recent generation {} is newer than current {}",
        mostRecentModelGeneration,
        modelGeneration);
  }

  // Only the unique name is wanted here; the file itself is removed again right away,
  // before the download below writes to that path.
  File modelPMMLFile = File.createTempFile("model-", ".pmml.gz");
  modelPMMLFile.deleteOnExit();
  IOUtils.delete(modelPMMLFile);

  Config config = ConfigUtils.getDefaultConfig();
  String instanceDir = config.getString("model.instance-dir");
  String generationPrefix =
      Namespaces.getInstanceGenerationPrefix(instanceDir, mostRecentModelGeneration);
  String modelPMMLKey = generationPrefix + "model.pmml.gz";

  Pair<DecisionForest, Map<Integer, BiMap<String, Integer>>> forestAndCatalog;
  try {
    Store.get().download(modelPMMLKey, modelPMMLFile);
    log.info("Loading model description from {}", modelPMMLKey);
    forestAndCatalog = DecisionForestPMML.read(modelPMMLFile);
  } finally {
    // Clean up even when download or parsing fails, so that repeated failed attempts
    // do not leave temp files behind until JVM exit.
    IOUtils.delete(modelPMMLFile);
  }
  log.info("Loaded model description");

  modelGeneration = mostRecentModelGeneration;
  currentModel = new Generation(forestAndCatalog.getFirst(), forestAndCatalog.getSecond());
}
/**
 * Overlays the "predicting R" test configuration, builds the X*Y' product via
 * {@code AlternatingLeastSquaresTest.buildTestXYTProduct()} and compares each of its first
 * five rows against fixed expected values.
 */
@Test
public void testALSPredictingR() throws Exception {
  ConfigUtils.overlayConfigOnDefault(
      getResourceAsFile("AlternatingLeastSquaresPredictingRTest.conf"));
  RealMatrix product = AlternatingLeastSquaresTest.buildTestXYTProduct();

  // Expected contents of the product, one entry per row, in row order
  float[][] expectedRows = {
    {0.0678369f, 0.6574759f, 2.1020291f, 2.0976211f, 0.1115919f},
    {-0.0176293f, 1.3062225f, 4.1365933f, 4.1739127f, -0.0380586f},
    {1.0854513f, -0.0344434f, 0.1725342f, -0.1564803f, 1.8502977f},
    {2.8377915f, 0.0528524f, 0.9041158f, 0.0474437f, 4.8365208f},
    {-0.0057799f, 0.6608552f, 2.0936351f, 2.1115670f, -0.0139042f},
  };

  for (int row = 0; row < expectedRows.length; row++) {
    assertArrayEquals(expectedRows[row], product.getRow(row));
  }
}
/**
 * Registers this application's servlets on the given context.
 *
 * <p>The classify endpoint is always registered. The train and refresh endpoints are
 * registered only when {@code serving-layer.api.read-only} is false.
 *
 * @param context context to add the servlets to
 */
@Override
public void addServlets(Context context) {
  addServlet(context, new ClassifyServlet(), "/classify/*");

  boolean readOnly = ConfigUtils.getDefaultConfig().getBoolean("serving-layer.api.read-only");
  if (readOnly) {
    // Read-only API: skip the endpoints registered below
    return;
  }

  addServlet(context, new TrainServlet(), "/train/*");
  addServlet(context, new RefreshServlet(), "/refresh/*");
}
public CandidateFilterFactory() { Config config = ConfigUtils.getDefaultConfig(); lshSampleRatio = config.getDouble("model.lsh.sample-ratio"); numHashes = config.getInt("model.lsh.num-hashes"); candidateFilterClassName = config.hasPath("serving-layer.candidate-filter-class") ? config.getString("serving-layer.candidate-filter-class") : null; }
@Override protected MRPipeline createPipeline() throws IOException { JobStepConfig stepConfig = getConfig(); ClusterSettings settings = ClusterSettings.create(ConfigUtils.getDefaultConfig()); String instanceDir = stepConfig.getInstanceDir(); int generationID = stepConfig.getGenerationID(); int iteration = stepConfig.getIteration(); String prefix = Namespaces.getInstanceGenerationPrefix(instanceDir, generationID); String outputKey = prefix + String.format("sketch/%d/", iteration); if (!validOutputPath(outputKey)) { return null; } // get normalized vectors String inputKey = prefix + "normalized/"; MRPipeline p = createBasicPipeline(DistanceToClosestFn.class); AvroType<Pair<Integer, RealVector>> inputType = Avros.pairs(Avros.ints(), MLAvros.vector()); PCollection<Pair<Integer, RealVector>> in = p.read(avroInput(inputKey, inputType)); // either create or load the set of currently chosen k-sketch vectors // they are stored in a KSketchIndex object DistanceToClosestFn<RealVector> distanceToClosestFn; UpdateIndexFn updateIndexFn; if (iteration == 1) { // Iteration 1 is the first real iteration; iteration 0 contains initial state KSketchIndex index = createInitialIndex(settings, in); distanceToClosestFn = new DistanceToClosestFn<>(index); updateIndexFn = new UpdateIndexFn(index); } else { // Get the index location from the previous iteration String previousIndexKey = prefix + String.format("sketch/%d/", iteration - 1); distanceToClosestFn = new DistanceToClosestFn<>(previousIndexKey); updateIndexFn = new UpdateIndexFn(previousIndexKey); } // compute distance of each vector in dataset to closest vector in k-sketch PTable<Integer, Pair<RealVector, Double>> weighted = in.parallelDo( "computeDistances", distanceToClosestFn, Avros.tableOf(Avros.ints(), Avros.pairs(MLAvros.vector(), Avros.doubles()))); // run weighted reservoir sampling on the vector to select another group of // settings.getSketchPoints() // to add to the k-sketch PTable<Integer, RealVector> kSketchSample = 
ReservoirSampling.groupedWeightedSample( weighted, settings.getSketchPoints(), RandomManager.getRandom()); // update the KSketchIndex with the newly-chosen vectors kSketchSample .parallelDo("updateIndex", updateIndexFn, Serializables.avro(KSketchIndex.class)) .write(avroOutput(outputKey)); return p; }
/**
 * Reads the number of recommendations to produce from {@code model.recommend.how-many},
 * requires it to be positive, and creates the ID mapping state from this function's
 * configuration.
 *
 * @throws IllegalArgumentException if the configured number of recommendations is not positive
 * @throws CrunchRuntimeException wrapping any {@link IOException} raised while creating the
 *     ID mapping state
 */
@Override
public void initialize() {
  super.initialize();

  numRecs = ConfigUtils.getDefaultConfig().getInt("model.recommend.how-many");
  boolean positive = numRecs > 0;
  Preconditions.checkArgument(positive, "# recommendations must be positive: %s", numRecs);

  try {
    idMapping = new IDMappingState(getConfiguration());
  } catch (IOException ioe) {
    throw new CrunchRuntimeException(ioe);
  }
}
/**
 * Chooses the URI prefix: {@code file:} when {@code model.local} is set, otherwise an
 * {@code hdfs://host[:port]} prefix taken from the default file system URI. The port is
 * included only when it is positive.
 */
private Namespaces() {
  if (!ConfigUtils.getDefaultConfig().getBoolean("model.local")) {
    URI defaultURI = FileSystem.getDefaultUri(new OryxConfiguration());
    String host = defaultURI.getHost();
    int port = defaultURI.getPort();
    String authority = port > 0 ? host + ':' + port : host;
    prefix = "hdfs://" + authority;
  } else {
    prefix = "file:";
  }
}
/**
 * Reads the decay settings from the default configuration and validates them.
 *
 * <p>{@code model.decay.factor} must lie in (0,1] and {@code model.decay.zeroThreshold} must
 * be nonnegative. Decay is enabled only when the factor is strictly less than 1.
 *
 * @throws IllegalArgumentException if either setting is out of range
 */
@Override
public void initialize() {
  super.initialize();
  Config config = ConfigUtils.getDefaultConfig();

  decayFactor = (float) config.getDouble("model.decay.factor");
  // Report the offending decay factor itself; zeroThreshold has not been read yet here
  Preconditions.checkArgument(
      decayFactor > 0.0f && decayFactor <= 1.0f,
      "Decay factor must be in (0,1]: %s",
      decayFactor);

  zeroThreshold = (float) config.getDouble("model.decay.zeroThreshold");
  Preconditions.checkArgument(
      zeroThreshold >= 0.0f, "Zero threshold must be nonnegative: %s", zeroThreshold);

  // A factor of exactly 1 leaves values unchanged, so decay is skipped in that case
  doDecay = decayFactor < 1.0f;
}
@Override protected MRPipeline createPipeline() throws IOException { JobStepConfig stepConfig = getConfig(); Config config = ConfigUtils.getDefaultConfig(); ClusterSettings clusterSettings = ClusterSettings.create(config); String instanceDir = stepConfig.getInstanceDir(); long generationID = stepConfig.getGenerationID(); String prefix = Namespaces.getInstanceGenerationPrefix(instanceDir, generationID); String outputKey = prefix + "weighted/"; if (!validOutputPath(outputKey)) { return null; } String indexKey = prefix + "sketch/" + clusterSettings.getSketchIterations(); String inputKey = prefix + "normalized/"; MRPipeline p = createBasicPipeline(ClosestSketchVectorFn.class); // first I compute the weight of each k-sketch vector, i.e., Voronoi partition // I aggregate all together and persist on disk // PCollection<ClosestSketchVectorData> weights = inputPairs(p, inputKey, MLAvros.vector()) PCollection<ClosestSketchVectorData> weights = PTables.asPTable( inputPairs(p, inputKey, MLAvros.vector()) .parallelDo( "computingSketchVectorWeights", new ClosestSketchVectorFn<RealVector>(indexKey, clusterSettings), Avros.pairs(Avros.ints(), Avros.reflects(ClosestSketchVectorData.class)))) .groupByKey(1) .combineValues(new ClosestSketchVectorAggregator(clusterSettings)) .values() .write(avroOutput(outputKey + "kSketchVectorWeights/")); // this "pipeline" takes a single ClosestSketchVectorData and returns weighted vectors // could be done outside MapReduce, but that would require me to materialize the // ClosestSketchVectorData weights .parallelDo( "generatingWeightedSketchVectors", new WeightVectorsFn(indexKey), KMeansTypes.FOLD_WEIGHTED_VECTOR) .write(avroOutput(outputKey + "weightedKSketchVectors/")); return p; }
/**
 * Creates a new mock serving model manager backed by the default configuration.
 *
 * @return a fresh {@link MockServingModelManager}
 */
protected MockServingModelManager getModelManager() {
  MockServingModelManager manager = new MockServingModelManager(ConfigUtils.getDefault());
  return manager;
}
@Test public void testALSSpeed() throws Exception { Map<String, Object> overlayConfig = new HashMap<>(); overlayConfig.put("oryx.speed.model-manager-class", ALSSpeedModelManager.class.getName()); overlayConfig.put("oryx.speed.streaming.generation-interval-sec", 5); overlayConfig.put("oryx.als.hyperparams.features", 2); Config config = ConfigUtils.overlayOn(overlayConfig, getConfig()); startMessaging(); List<Pair<String, String>> updates = startServerProduceConsumeTopics( config, new MockALSInputGenerator(), new MockALSModelUpdateGenerator(), 9, 10); if (log.isDebugEnabled()) { for (Pair<String, String> update : updates) { log.debug("{}", update); } } // 10 original updates. 9 generate just 1 update since user or item is new. assertEquals(19, updates.size()); assertEquals("MODEL", updates.get(0).getFirst()); assertEquals( 2, Integer.parseInt( AppPMMLUtils.getExtensionValue( PMMLUtils.fromString(updates.get(0).getSecond()), "features"))); for (int i = 1; i <= 9; i++) { assertEquals("UP", updates.get(i).getFirst()); List<?> update = MAPPER.readValue(updates.get(i).getSecond(), List.class); boolean isX = "X".equals(update.get(0).toString()); String id = update.get(1).toString(); float[] expected = (isX ? MockALSModelUpdateGenerator.X : MockALSModelUpdateGenerator.Y).get(id); assertArrayEquals(expected, MAPPER.convertValue(update.get(2), float[].class)); @SuppressWarnings("unchecked") Collection<String> knownUsersItems = (Collection<String>) update.get(3); Collection<String> expectedKnownUsersItems = (isX ? MockALSModelUpdateGenerator.A : MockALSModelUpdateGenerator.At).get(id); assertTrue(knownUsersItems.containsAll(expectedKnownUsersItems)); assertTrue(expectedKnownUsersItems.containsAll(knownUsersItems)); } /* * User 100 - 104 are solutions to eye(5)*Y*pinv(Y'*Y), but default scaling * will produce values that are 3/4 of this since they are brand new. 
* That is, it's really the solution to (0.75*eye(5))*Y*pinv(Y'*Y) * Likewise 105 - 108 are (0.75*eye(4))*X*pinv(X'*X) */ Map<String, float[]> X = MockALSModelUpdateGenerator.buildMatrix( 100, new float[][] { {-0.20859924f, 0.25232133f}, {-0.22472803f, -0.1929485f}, {-0.15592135f, 0.3977631f}, {-0.3006522f, -0.12239703f}, {-0.09205295f, -0.37471837f}, }); Map<String, float[]> Y = MockALSModelUpdateGenerator.buildMatrix( 105, new float[][] { {-0.19663288f, 0.09574106f}, {-0.23840417f, -0.50850725f}, {-0.34360975f, 0.2466687f}, {-0.060204573f, 0.29311115f}, }); for (int i = 10; i <= 18; i++) { assertEquals("UP", updates.get(i).getFirst()); List<?> update = MAPPER.readValue(updates.get(i).getSecond(), List.class); boolean isX = "X".equals(update.get(0).toString()); String id = update.get(1).toString(); float[] expected = (isX ? X : Y).get(id); assertArrayEquals(expected, MAPPER.convertValue(update.get(2), float[].class), 1.0e-5f); String otherID = ALSUtilsTest.idToStringID(ALSUtilsTest.stringIDtoID(id) - 99); @SuppressWarnings("unchecked") Collection<String> knownUsersItems = (Collection<String>) update.get(3); assertEquals(1, knownUsersItems.size()); assertEquals(otherID, knownUsersItems.iterator().next()); } }
/**
 * Creates a manager that starts with no model generation loaded and reads
 * {@code serving-layer.disable-write-updates} from the default configuration.
 *
 * @param appendTempDir directory passed through to the superclass constructor
 * @throws IOException if the superclass constructor fails
 */
public RDFGenerationManager(File appendTempDir) throws IOException {
  super(appendTempDir);
  modelGeneration = NO_GENERATION;
  disableWriteUpdates =
      ConfigUtils.getDefaultConfig().getBoolean("serving-layer.disable-write-updates");
}