/** * split the neighbor hood in two groups based on 2 k-means * * @param neighborhood * @return */ private Pair<List<Gene>, List<Gene>> twoMeanClusterSplit(List<Gene> neighborhood) { final int n = neighborhood.size(); final int maxit = desc.getMaxit(); final double eps = desc.getEps(); int a_start = r.nextInt(n); int b_start = r.nextInt(n); Gene a_center = new Gene(1, -1, Arrays.copyOf(neighborhood.get(a_start).data, samples)); Gene b_center = new Gene(1, -1, Arrays.copyOf(neighborhood.get(b_start).data, samples)); float[] a_center_pong = new float[samples]; Arrays.fill(a_center_pong, Float.NaN); float[] b_center_pong = new float[samples]; Arrays.fill(b_center_pong, Float.NaN); float[] tmp; BitSet partOf_a = new BitSet(n); double d_old = 0; for (int i = 0; i < maxit; ++i) { int j = 0; int changed = 0; double d_new = 0; for (Gene gene : neighborhood) { final double a_distance = distance(a_center, gene); final double b_distance = distance(b_center, gene); final boolean in_a = a_distance < b_distance; if (partOf_a.get(j) != in_a) { changed++; partOf_a.set(j, in_a); } d_new += in_a ? a_distance : b_distance; tmp = in_a ? a_center_pong : b_center_pong; // shift new center for (int k = 0; k < samples; ++k) { if (!gene.isNaN(k)) { if (Float.isNaN(tmp[k])) tmp[k] = gene.get(k); else tmp[k] += gene.get(k); } } j++; } if (changed == 0 || d_new == 0) break; final double ratio = Math.abs(d_new - d_old) / d_old; if (i > 0 && ratio < eps) break; d_old = d_new; int a_n = partOf_a.cardinality(); int b_n = n - a_n; if (a_n == 0 || b_n == 0) { // FIXME } updateCenter(a_center, a_center_pong, a_n); updateCenter(b_center, b_center_pong, b_n); } return split(neighborhood, partOf_a); }
private double distance(Gene target, Gene neighbor) { double acc = 0; int n = 0; for (int sample = 0; sample < samples; ++sample) { if (target.isNaN(sample) || neighbor.isNaN(sample)) // skip missing continue; double dx = target.get(sample) - neighbor.get(sample); acc += dx * dx; n++; } if (n > 0) { return acc / n; // FIXME according to the fortran code, this is not the eucledian distance // return Math.sqrt(acc); } return Double.POSITIVE_INFINITY; }
/**
 * Summarizes one sample over all genes: the mean of the values that are present and
 * the number of missing (NaN) values.
 *
 * @param sample
 *            index of the sample to summarize
 * @return a {@code Sample} created from the mean and the missing count; the mean is
 *         NaN if no gene has a value for this sample
 */
private Sample computeSample(final int sample) {
    int missing = 0;
    int present = 0;
    double total = 0;
    for (Gene g : genes) {
        final double value = g.get(sample);
        if (!isNaN(value)) {
            total += value;
            present++;
        } else {
            missing++;
        }
    }
    // with no present value this is 0.0 / 0, i.e. NaN
    final double mean = total / present;
    return new Sample(mean, missing);
}
/**
 * Returns the value for the given sample by delegating to {@code gene.get(sample)}.
 *
 * @param sample
 *            index of the sample to look up
 * @return the value reported by the wrapped gene, as a double
 */
public double get(int sample) {
    final double value = gene.get(sample);
    return value;
}