/** * Performs SOTA tree construction given parameters provided in <code>AlgorithmData</code>. * Results are returned in AlgorthmData */ public AlgorithmData execute(AlgorithmData data) throws AlgorithmException { // Get parameters AlgorithmParameters params = data.getParams(); sotaGenes = params.getBoolean("sota-cluster-genes", true); maxNumEpochs = params.getInt("max-epochs-per-cycle", 1000); maxNumCycles = params.getInt("max-number-of-cycles", 10); epochCriteria = params.getFloat("epoch-improvement-cutoff"); endCriteria = params.getFloat("end-training-diversity"); runToMaxCycles = params.getBoolean("run-to-max-cycles"); useClusterVariance = params.getBoolean("use-cluster-variance", false); function = params.getInt("distance-function", EUCLIDEAN); absolute = params.getBoolean("distance-absolute", true); calcClusterHCL = params.getBoolean("calcClusterHCL", false); calculate_genes = params.getBoolean("calculate-genes", false); calculate_experiments = params.getBoolean("calculate-experiments", false); calcFullTreeHCL = params.getBoolean("calcFullTreeHCL", false); method = params.getInt("method-linkage", 0); pValue = params.getFloat("pValue", (float) 0.05); migW = params.getFloat("mig_w", (float) 0.01); migP = params.getFloat("mig_p", (float) 0.005); migS = params.getFloat("mig_s", (float) 0.001); neighborhoodLevel = params.getInt("neighborhood-level", 5); hcl_function = params.getInt("hcl-distance-function", EUCLIDEAN); hcl_absolute = params.getBoolean("hcl-distance-absolute", false); inData = data; // keep a handle on AlgorithmData for return // Set factor based on function if ((function == PEARSON) || (function == PEARSONUNCENTERED) || (function == PEARSONSQARED) || (function == COSINE) || (function == COVARIANCE) || (function == DOTPRODUCT) || (function == SPEARMANRANK) || (function == KENDALLSTAU)) { myFactor = -1.0f; } else { myFactor = 1.0f; } factor = (float) 1.0; // scaling factor sent to getDistance methods inData.addParam("factor", String.valueOf(myFactor)); // return factor endCriteria *= myFactor; // alter polarity fo endCriteria based on metric treeDiversity = Float.POSITIVE_INFINITY; dataMatrix = data.getMatrix("experiment"); // point dataMatrix at supplied matrix numberOfGenes = dataMatrix.getRowDimension(); numberOfSamples = dataMatrix.getColumnDimension(); myNucleus = new SOTACell[numberOfGenes]; // will be shortcut from gene index to a cell cycleDiversity = new Vector(); // reset max number of cycles if limited by number of genes if (maxNumCycles >= numberOfGenes) maxNumCycles = numberOfGenes - 1; // if using variablility, resample data, select cutoff based on p value supplied if (useClusterVariance) { endCriteria = resampleAndGetNewCutoff(dataMatrix, pValue); } // initialize first cell and two children root = new SOTACell(numberOfSamples, dataMatrix); root.right = new SOTACell(numberOfSamples, dataMatrix); root.left = new SOTACell(numberOfSamples, dataMatrix); numberOfClusters = 2; root.left.parent = root; root.right.parent = root; head = root.left; root.left.succ = root.right; root.right.pred = root.left; int[] numberOfValidGenesInSample = new int[numberOfSamples]; // set to zero for (int i = 0; i < numberOfSamples; i++) numberOfValidGenesInSample[i] = 0; // Inialize centroid root centroid to zeros for (int i = 0; i < numberOfSamples; i++) { root.centroidGene.set(0, i, 0); } for (int i = 0; i < numberOfGenes; i++) { root.members.add(new Integer(i)); // add all gene indices to root myNucleus[i] = root; // set all gene nuclei to point to root for (int j = 0; j < numberOfSamples; j++) { if (!(Float.isNaN(dataMatrix.get(i, j)))) { numberOfValidGenesInSample[j]++; // count number of genes with valid data in each sample root.centroidGene.set( 0, j, root.centroidGene.get(0, j) + dataMatrix.get(i, j)); // calcualtes sum } } } mostDiverseCell = root; mostVariableCell = root; for (int j = 0; j < numberOfSamples; j++) { root.centroidGene.set( 0, j, root.centroidGene.get(0, j) / numberOfValidGenesInSample[j]); // get a mean root centroid root.left.centroidGene.set(0, j, root.centroidGene.get(0, j)); // assign to children root.right.centroidGene.set(0, j, root.centroidGene.get(0, j)); } // put first value into diversity vector initDivSum = getNodeDiversitySum(root); cycleDiversity.add(new Float(initDivSum)); root.cellDiversity = initDivSum / numberOfGenes; if (useClusterVariance) root.cellVariance = getNodeVariance(root); if (runToMaxCycles) growSOTUnrestricted(); // make tree w/o regard to diversity else growSOT(); // Construct tree // If performing HCL on samples using all genes if (calcFullTreeHCL) { calcFullTreeHCL(); } // Code for HCL clustering if (calcClusterHCL) { calculateClusterHCL(); // calculate HCL trees for SOTA clusters } return inData; // inData has results incorporated }
// sets cell diversities, and variances (if required) private void setDiversities() { SOTACell curr = head; double cellSum = 0; double cellVar = 0; double treeSum = 0; double maxCellDiv = -1; double maxCellVar = -1; int numberOfCells = 0; double currDist = 0; mostDiverseCell = head; mostVariableCell = head; while (curr != null) { numberOfCells++; cellSum = 0; // for all members of the node get distance to set cell resource (diversity) for (int i = 0; i < curr.members.size(); i++) { cellSum += ExperimentUtil.geneDistance( dataMatrix, curr.centroidGene, ((Integer) (curr.members.elementAt(i))).intValue(), 0, function, factor, absolute); } curr.cellDiversity = (cellSum / curr.members.size()); if (curr.cellDiversity > maxCellDiv && curr.members.size() > 1) { maxCellDiv = curr.cellDiversity; mostDiverseCell = curr; } treeSum += cellSum; if (useClusterVariance) { // using cell variance, need to find mostVariable cell cellVar = 0; currDist = 0; // get cell varience // if new members have been added if (curr.changedMembership) { // use max gene to gene distance for (int i = 0; i < curr.members.size(); i++) { for (int j = 0; j < curr.members.size(); j++) { currDist = ExperimentUtil.geneDistance( dataMatrix, null, ((Integer) (curr.members.elementAt(i))).intValue(), ((Integer) (curr.members.elementAt(j))).intValue(), function, factor, absolute); // get max dist. to be cellVar if (currDist > cellVar) { cellVar = currDist; } } } curr.cellVariance = cellVar; } else // no change to membership so we dont hve to recalculate variance cellVar = curr.cellVariance; if (cellVar > maxCellVar && curr.members.size() > 1) { maxCellVar = cellVar; mostVariableCell = curr; } } curr.changedMembership = false; // variance already set for current population curr = curr.succ; } treeDiversity = treeSum; }