// returns the distribution of trainingNode memeber genes among left and right children private Dimension runNodeEpoch(SOTACell trainingNode) { SOTACell myCell = null; SOTACell sisterCell = null; int rightCnt = 0; int leftCnt = 0; int memberGene = 0; // for all genes in the training node, find closest child, migrate child for (int geneNum = 0; geneNum < trainingNode.members.size(); geneNum++) { memberGene = ((Integer) trainingNode.members.elementAt(geneNum)).intValue(); myCell = findMyDaughterCell(trainingNode, memberGene); // only look among children // dont add to membership // later make sure that left and right membership set is not null if (myCell == trainingNode.left) leftCnt++; else rightCnt++; myCell.migrateCentroid(memberGene, migW); sisterCell = findSister(myCell); // if sister has no offspring then migrate parent and sister if (sisterCell.left == null && sisterCell.right == null) { myCell.parent.migrateCentroid(memberGene, migP); sisterCell.migrateCentroid(memberGene, migS); } } return new Dimension(leftCnt, rightCnt); }
/**
 * Splits {@code cellToDivide} into two freshly created children.
 *
 * <p>The children take the divided cell's place in the pred/succ threaded list
 * (left child first, then right child), {@code head} is moved to the left child if
 * the divided cell was the head, and both children start with a copy of the
 * divided cell's centroid. {@code numberOfClusters} grows by one.
 *
 * @param cellToDivide the cell to split; its own pred/succ links are cleared
 */
private void divideCell(SOTACell cellToDivide) {
    cellToDivide.left = new SOTACell(numberOfSamples, dataMatrix);
    cellToDivide.right = new SOTACell(numberOfSamples, dataMatrix);
    numberOfClusters++;

    SOTACell leftChild = cellToDivide.left;
    SOTACell rightChild = cellToDivide.right;

    leftChild.parent = cellToDivide;
    rightChild.parent = cellToDivide;

    // Thread the two children to each other.
    rightChild.pred = leftChild;
    leftChild.succ = rightChild;

    // Splice the pair into the position the divided cell occupied in the list.
    leftChild.pred = cellToDivide.pred;
    if (leftChild.pred != null) {
        leftChild.pred.succ = leftChild;
    }
    rightChild.succ = cellToDivide.succ;
    if (rightChild.succ != null) {
        rightChild.succ.pred = rightChild;
    }

    if (cellToDivide == head) {
        head = leftChild;
    }

    // The divided cell is no longer part of the threaded list.
    cellToDivide.succ = null;
    cellToDivide.pred = null;

    // Both children start from the parent's centroid.
    for (int sample = 0; sample < numberOfSamples; sample++) {
        leftChild.centroidGene.set(0, sample, cellToDivide.centroidGene.get(0, sample));
        rightChild.centroidGene.set(0, sample, cellToDivide.centroidGene.get(0, sample));
    }
}
/**
 * Finds the cell in the threaded list (starting at {@code head}) whose centroid is
 * closest to the given gene, and records it as the gene's nucleus.
 *
 * <p>Ties are resolved in favour of the cell encountered later in the list because
 * the comparison is {@code <=}. If the closest cell differs from the one stored in
 * {@code myNucleus[geneNum]}, the entry is updated and the gene is added to the new
 * cell via {@code addMember}.
 *
 * @param geneNum row index of the gene in {@code dataMatrix}
 * @return the closest cell; {@code head} if no cell compared as closer
 */
private SOTACell findMyCell(int geneNum) {
    SOTACell nearest = head;
    double nearestDist = Float.POSITIVE_INFINITY;

    for (SOTACell cell = head; cell != null; cell = cell.succ) {
        double dist =
            ExperimentUtil.geneDistance(
                dataMatrix, cell.centroidGene, geneNum, 0, function, factor, absolute);
        if (dist <= nearestDist) {
            nearestDist = dist;
            nearest = cell;
        }
    }

    if (myNucleus[geneNum] != nearest) {
        myNucleus[geneNum] = nearest;
        nearest.addMember(geneNum);
    }
    return nearest;
}
private SOTACell findMyCellInSubTree(SOTACell trainingCell, int geneNum, int level) { SOTACell currCell = trainingCell; SOTACell myCell = trainingCell; int levelIndex = 0; while (currCell.parent != null && levelIndex < level) { currCell = currCell.parent; levelIndex++; } // now currNode is at root, or 'level' number of nodes above the training node Vector cellList = new Vector(); getCellsBelow(cellList, currCell); float minDist = Float.POSITIVE_INFINITY; float currDist; for (int i = 0; i < cellList.size(); i++) { currCell = (SOTACell) (cellList.elementAt(i)); currDist = ExperimentUtil.geneDistance( dataMatrix, currCell.centroidGene, geneNum, 0, function, factor, absolute); if (currDist < minDist) { minDist = currDist; myCell = currCell; } } if (myNucleus[geneNum] != myCell) { myNucleus[geneNum] = myCell; myCell.addMember(geneNum); } return myCell; }
/** * Performs SOTA tree construction given parameters provided in <code>AlgorithmData</code>. * Results are returned in AlgorthmData */ public AlgorithmData execute(AlgorithmData data) throws AlgorithmException { // Get parameters AlgorithmParameters params = data.getParams(); sotaGenes = params.getBoolean("sota-cluster-genes", true); maxNumEpochs = params.getInt("max-epochs-per-cycle", 1000); maxNumCycles = params.getInt("max-number-of-cycles", 10); epochCriteria = params.getFloat("epoch-improvement-cutoff"); endCriteria = params.getFloat("end-training-diversity"); runToMaxCycles = params.getBoolean("run-to-max-cycles"); useClusterVariance = params.getBoolean("use-cluster-variance", false); function = params.getInt("distance-function", EUCLIDEAN); absolute = params.getBoolean("distance-absolute", true); calcClusterHCL = params.getBoolean("calcClusterHCL", false); calculate_genes = params.getBoolean("calculate-genes", false); calculate_experiments = params.getBoolean("calculate-experiments", false); calcFullTreeHCL = params.getBoolean("calcFullTreeHCL", false); method = params.getInt("method-linkage", 0); pValue = params.getFloat("pValue", (float) 0.05); migW = params.getFloat("mig_w", (float) 0.01); migP = params.getFloat("mig_p", (float) 0.005); migS = params.getFloat("mig_s", (float) 0.001); neighborhoodLevel = params.getInt("neighborhood-level", 5); hcl_function = params.getInt("hcl-distance-function", EUCLIDEAN); hcl_absolute = params.getBoolean("hcl-distance-absolute", false); inData = data; // keep a handle on AlgorithmData for return // Set factor based on function if ((function == PEARSON) || (function == PEARSONUNCENTERED) || (function == PEARSONSQARED) || (function == COSINE) || (function == COVARIANCE) || (function == DOTPRODUCT) || (function == SPEARMANRANK) || (function == KENDALLSTAU)) { myFactor = -1.0f; } else { myFactor = 1.0f; } factor = (float) 1.0; // scaling factor sent to getDistance methods inData.addParam("factor", 
String.valueOf(myFactor)); // return factor endCriteria *= myFactor; // alter polarity fo endCriteria based on metric treeDiversity = Float.POSITIVE_INFINITY; dataMatrix = data.getMatrix("experiment"); // point dataMatrix at supplied matrix numberOfGenes = dataMatrix.getRowDimension(); numberOfSamples = dataMatrix.getColumnDimension(); myNucleus = new SOTACell[numberOfGenes]; // will be shortcut from gene index to a cell cycleDiversity = new Vector(); // reset max number of cycles if limited by number of genes if (maxNumCycles >= numberOfGenes) maxNumCycles = numberOfGenes - 1; // if using variablility, resample data, select cutoff based on p value supplied if (useClusterVariance) { endCriteria = resampleAndGetNewCutoff(dataMatrix, pValue); } // initialize first cell and two children root = new SOTACell(numberOfSamples, dataMatrix); root.right = new SOTACell(numberOfSamples, dataMatrix); root.left = new SOTACell(numberOfSamples, dataMatrix); numberOfClusters = 2; root.left.parent = root; root.right.parent = root; head = root.left; root.left.succ = root.right; root.right.pred = root.left; int[] numberOfValidGenesInSample = new int[numberOfSamples]; // set to zero for (int i = 0; i < numberOfSamples; i++) numberOfValidGenesInSample[i] = 0; // Inialize centroid root centroid to zeros for (int i = 0; i < numberOfSamples; i++) { root.centroidGene.set(0, i, 0); } for (int i = 0; i < numberOfGenes; i++) { root.members.add(new Integer(i)); // add all gene indices to root myNucleus[i] = root; // set all gene nuclei to point to root for (int j = 0; j < numberOfSamples; j++) { if (!(Float.isNaN(dataMatrix.get(i, j)))) { numberOfValidGenesInSample[j]++; // count number of genes with valid data in each sample root.centroidGene.set( 0, j, root.centroidGene.get(0, j) + dataMatrix.get(i, j)); // calcualtes sum } } } mostDiverseCell = root; mostVariableCell = root; for (int j = 0; j < numberOfSamples; j++) { root.centroidGene.set( 0, j, root.centroidGene.get(0, j) / 
numberOfValidGenesInSample[j]); // get a mean root centroid root.left.centroidGene.set(0, j, root.centroidGene.get(0, j)); // assign to children root.right.centroidGene.set(0, j, root.centroidGene.get(0, j)); } // put first value into diversity vector initDivSum = getNodeDiversitySum(root); cycleDiversity.add(new Float(initDivSum)); root.cellDiversity = initDivSum / numberOfGenes; if (useClusterVariance) root.cellVariance = getNodeVariance(root); if (runToMaxCycles) growSOTUnrestricted(); // make tree w/o regard to diversity else growSOT(); // Construct tree // If performing HCL on samples using all genes if (calcFullTreeHCL) { calcFullTreeHCL(); } // Code for HCL clustering if (calcClusterHCL) { calculateClusterHCL(); // calculate HCL trees for SOTA clusters } return inData; // inData has results incorporated }
// sets cell diversities, and variances (if required) private void setDiversities() { SOTACell curr = head; double cellSum = 0; double cellVar = 0; double treeSum = 0; double maxCellDiv = -1; double maxCellVar = -1; int numberOfCells = 0; double currDist = 0; mostDiverseCell = head; mostVariableCell = head; while (curr != null) { numberOfCells++; cellSum = 0; // for all members of the node get distance to set cell resource (diversity) for (int i = 0; i < curr.members.size(); i++) { cellSum += ExperimentUtil.geneDistance( dataMatrix, curr.centroidGene, ((Integer) (curr.members.elementAt(i))).intValue(), 0, function, factor, absolute); } curr.cellDiversity = (cellSum / curr.members.size()); if (curr.cellDiversity > maxCellDiv && curr.members.size() > 1) { maxCellDiv = curr.cellDiversity; mostDiverseCell = curr; } treeSum += cellSum; if (useClusterVariance) { // using cell variance, need to find mostVariable cell cellVar = 0; currDist = 0; // get cell varience // if new members have been added if (curr.changedMembership) { // use max gene to gene distance for (int i = 0; i < curr.members.size(); i++) { for (int j = 0; j < curr.members.size(); j++) { currDist = ExperimentUtil.geneDistance( dataMatrix, null, ((Integer) (curr.members.elementAt(i))).intValue(), ((Integer) (curr.members.elementAt(j))).intValue(), function, factor, absolute); // get max dist. to be cellVar if (currDist > cellVar) { cellVar = currDist; } } } curr.cellVariance = cellVar; } else // no change to membership so we dont hve to recalculate variance cellVar = curr.cellVariance; if (cellVar > maxCellVar && curr.members.size() > 1) { maxCellVar = cellVar; mostVariableCell = curr; } } curr.changedMembership = false; // variance already set for current population curr = curr.succ; } treeDiversity = treeSum; }
// Note that leaves are threaded from left to right. // This means that if displayed top to bottom, centroids would be reversed // Therefore, accumulate in reverse order into AlgorithmData private void getResults() { SOTACell curr = head; int numCells = 0; FloatMatrix centroidFM = new FloatMatrix(numberOfClusters, numberOfSamples); FloatMatrix varianceFM = new FloatMatrix(numberOfClusters, numberOfSamples); int[] clusterSize = new int[numberOfClusters]; FloatMatrix clusterDiversity = new FloatMatrix(numberOfClusters, 1); int numDiv = cycleDiversity.size(); FloatMatrix cycleDivFM = new FloatMatrix(numDiv, 1); int[] clusterOrder = new int[numberOfClusters]; clusters = new Cluster(); NodeList nodeList = clusters.getNodeList(); Node newNode; int[] clusterMembership; int clusterPop; // move to tail while (curr.succ != null) curr = curr.succ; // now curr is at the tail while (numCells <= numberOfClusters && curr != null) { for (int i = 0; i < numberOfSamples; i++) { centroidFM.set(numCells, i, curr.centroidGene.get(0, i)); varianceFM.set(numCells, i, curr.getColumnVar(i)); } clusterPop = curr.members.size(); clusterSize[numCells] = clusterPop; clusterDiversity.set( numCells, 0, (float) curr.cellDiversity * (float) myFactor); // alter poloarity by myFactor based on metric clusterOrder[numCells] = numCells; // accumulate cluster probe indicies clusterMembership = new int[clusterPop]; for (int i = 0; i < clusterPop; i++) { clusterMembership[i] = ((Integer) (curr.members.elementAt(i))).intValue(); } newNode = new Node(); newNode.setProbesIndexes(clusterMembership); nodeList.addNode(newNode); numCells++; curr = curr.pred; } // now accumlate cycle divresity information if (myFactor == 1) { float initDiv = ((Float) (cycleDiversity.elementAt(0))).floatValue(); for (int i = 0; i < numDiv; i++) { cycleDivFM.set(i, 0, (((Float) (cycleDiversity.elementAt(i))).floatValue()) / initDiv); } } else { float lowerLim = numberOfGenes * myFactor; float initDiv = ((Float) 
(cycleDiversity.elementAt(0))).floatValue() + Math.abs(lowerLim); for (int i = 0; i < numDiv; i++) { cycleDivFM.set( i, 0, (((Float) (cycleDiversity.elementAt(i))).floatValue() + Math.abs(lowerLim)) / initDiv); } } // put all important information into AlgorithmData inData.addParam("cycles", String.valueOf(numberOfClusters)); inData.addCluster("cluster", clusters); inData.addMatrix("centroid-matrix", centroidFM); inData.addMatrix("cluster-variances", varianceFM); inData.addMatrix("cluster-diversity", clusterDiversity); inData.addMatrix("cycle-diversity", cycleDivFM); inData.addIntArray("cluster-population", clusterSize); // Additions to AlgorithmData to allow drawing arrays float[] nodeHeight = new float[numberOfClusters * 2]; int[] nodePopulation = new int[numberOfClusters * 2]; int[] leftChild = new int[nodeHeight.length * 2]; int[] rightChild = new int[nodeHeight.length * 2]; initializeReturnValues(nodeHeight, nodePopulation, leftChild, rightChild); utilCounter = 0; loadReturnValues(root, 0, nodeHeight, nodePopulation, leftChild, rightChild); inData.addMatrix("node-heights", new FloatMatrix(nodeHeight, nodeHeight.length)); inData.addIntArray("left-child", leftChild); inData.addIntArray("right-child", rightChild); inData.addIntArray("node-population", nodePopulation); if (useClusterVariance) inData.addParam("computed-var-cutoff", String.valueOf(endCriteria)); return; }