@Override protected Table<Integer, Integer, Float> compute() { // data // An expression matrix with genes in the rows, samples in the columns // k // Number of neighbors to be used in the imputation (default=10) // rowmax // The maximum percent missing data allowed in any row (default 50%). For any // rows with more than rowmax% missing are imputed using the overall mean per // sample. // colmax // The maximum percent missing data allowed in any column (default 80%). If // any column has more than colmax% missing data, the program halts and reports // an error. // maxp // The largest block of genes imputed using the knn algorithm inside impute.knn // (default 1500); larger blocks are divided by two-means clustering (recursively) // prior to imputation. If maxp=p, only knn imputation is done. // rng.seed // The seed used for the random number generator (default 362436069) for repro- // ducibility. // impute.knn uses k-nearest neighbors in the space of genes to impute missing expression // values. // For each gene with missing values, we find the k nearest neighbors using a Euclidean metric, // con- // fined to the columns for which that gene is NOT missing. Each candidate neighbor might be // missing // some of the coordinates used to calculate the distance. In this case we average the distance // from // the non-missing coordinates. Having found the k nearest neighbors for a gene, we impute the // miss- // ing elements by averaging those (non-missing) elements of its neighbors. This can fail if // ALL the // neighbors are missing in a particular element. In this case we use the overall column mean // for that // block of genes. // Since nearest neighbor imputation costs O(plog(p)) operations per gene, where p is the // number // of rows, the computational time can be excessive for large p and a large number of missing // rows. // Our strategy is to break blocks with more than maxp genes into two smaller blocks using // two-mean // clustering. This is done recursively till all blocks have less than maxp genes. For each // block, k- // nearest neighbor imputation is done separately. We have set the default value of maxp to // 1500. // Depending on the speed of the machine, and number of samples, this number might be // increased. // Making it too small is counter-productive, because the number of two-mean clustering // algorithms // will increase. if (toomanyNaNsInAColumn()) throw new IllegalStateException(); final float rowMax = desc.getRowmax(); final boolean validRowMax = !Float.isInfinite(rowMax) && !Float.isNaN(rowMax); final int max = validRowMax ? Math.round(desc.getRowmax() * samples) : 0; // list of possible List<Gene> neighborhood; int withMissing = 0; Collection<ForkJoinTask<Void>> tasks = new ArrayList<>(); if (!validRowMax) { neighborhood = genes; // all genes } else { neighborhood = new ArrayList<>(genes.size()); for (Gene gene : genes) { if (gene.getNaNs() == 0) { // nothing to impute neighborhood.add(gene); } else if (validRowMax && gene.getNaNs() > max) { // too many nans use the sample mean tasks.add(new ImputeSampleMean(gene)); // not a good neighbor } else { // neighbor but something needs to be done neighborhood.add(gene); withMissing++; } } } if (withMissing > 0) tasks.add(new ImputeKNNMean(neighborhood)); invokeAll(tasks); ImmutableTable.Builder<Integer, Integer, Float> b = ImmutableTable.builder(); for (Gene gene : genes) { if (gene.isAnySet()) { gene.fillImpute(b); } } return b.build(); }