public double[] evaluate() { // the result will contain MDEF/ SIgmaMDEF the higher this ratio is the // more outling the result is.Lower than or equal 3 is not considered an // outlier double[] result = new double[n]; DistancePair[][] criticalDistances = new DistancePair[n][2 * n]; int secondDimension = 2 * n; // preprocessing for (int i = 0; i < n; i++) { int firstIndex = i << 1; int secondIndex = firstIndex + 1; int current = secondIndex + 1; // cardinality -2 means that there actually a point criticalDistances[i][firstIndex] = new DistancePair(0, -2, i); // cardinality -1 means that there is no point just alpha critical // distance criticalDistances[i][secondIndex] = new DistancePair(0, -1, -1); for (int j = i + 1; j < n; j++) { // draw back this assumes that the distance measure is symmetric double currentDistance = measure.calculateDistance(points[i], points[j]); double alphaCurrentDistance = currentDistance / alpha; criticalDistances[i][current++] = new DistancePair(currentDistance, -2, j); criticalDistances[i][current++] = new DistancePair(alphaCurrentDistance, -1, -1); criticalDistances[j][firstIndex] = new DistancePair(currentDistance, -2, i); criticalDistances[j][secondIndex] = new DistancePair(alphaCurrentDistance, -1, -1); } Arrays.sort(criticalDistances[i]); int cardinality = 0; for (int j = 0; j < secondDimension; j++) { if (criticalDistances[i][j].cardinality == -2) { cardinality += weight[criticalDistances[i][j].index]; } criticalDistances[i][j].cardinality = cardinality; } } // computation of MDEF for (int i = 0; i < n; i++) { result[i] = 0; for (int j = 0; j < secondDimension; j++) { if (criticalDistances[i][j].cardinality < nmin) continue; if (j != secondDimension - 1 && criticalDistances[i][j].distance == criticalDistances[i][j + 1].distance) continue; // alpha r distance double alphaR = criticalDistances[i][j].distance * alpha; int nPR = criticalDistances[i][j].cardinality; int nPRAlpha = find(0, secondDimension, alphaR, criticalDistances[i]); 
double summationNPRALpha = 0.0; // this is the loop I should try to vanish for (int k = 0; k <= j; k++) { int index = criticalDistances[i][k].index; if (index == -1) continue; int currentNRPAlpa = find(0, secondDimension, alphaR, criticalDistances[index]); summationNPRALpha += currentNRPAlpa; } double nHatPRAlpha = summationNPRALpha * 1.0 / nPR; double squaredNPRAlpha = 0.0; for (int k = 0; k <= j; k++) { int index = criticalDistances[i][k].index; if (index == -1) continue; int currentNRPAlpa = find(0, secondDimension, alphaR, criticalDistances[index]); double delta = currentNRPAlpa - nHatPRAlpha; squaredNPRAlpha += delta * delta; } double sigmaPRAlpha = Math.sqrt(squaredNPRAlpha / nPR); double MDEF = 1.0 - nPRAlpha / nHatPRAlpha; double sigmaMDEF = sigmaPRAlpha / nHatPRAlpha; double currentRes; if (sigmaMDEF == 0) currentRes = 0; else currentRes = MDEF / sigmaMDEF; if (currentRes > result[i]) result[i] = currentRes; } } return result; }
private void addEdges() { // remove old edges if available Iterator<String> e = edgeLabelMap.keySet().iterator(); while (e.hasNext()) { graph.removeEdge(e.next()); } edgeLabelMap.clear(); boolean isDistance = measure.isDistance(); Attribute id = exampleSet.getAttributes().getId(); List<SortableEdge> sortableEdges = new LinkedList<SortableEdge>(); for (int i = 0; i < exampleSet.size(); i++) { Example example = exampleSet.getExample(i); for (int j = i + 1; j < exampleSet.size(); j++) { Example comExample = exampleSet.getExample(j); if (isDistance) sortableEdges.add( new SortableEdge( example.getValueAsString(id), comExample.getValueAsString(id), null, measure.calculateDistance(example, comExample), SortableEdge.DIRECTION_INCREASE)); else sortableEdges.add( new SortableEdge( example.getValueAsString(id), comExample.getValueAsString(id), null, measure.calculateSimilarity(example, comExample), SortableEdge.DIRECTION_DECREASE)); } } Collections.sort(sortableEdges); int numberOfEdges = distanceSlider.getValue(); int counter = 0; double minStrength = Double.POSITIVE_INFINITY; double maxStrength = Double.NEGATIVE_INFINITY; Map<String, Double> strengthMap = new HashMap<String, Double>(); for (SortableEdge sortableEdge : sortableEdges) { if (counter > numberOfEdges) break; String idString = edgeFactory.create(); graph.addEdge( idString, sortableEdge.getFirstVertex(), sortableEdge.getSecondVertex(), EdgeType.UNDIRECTED); edgeLabelMap.put(idString, Tools.formatIntegerIfPossible(sortableEdge.getEdgeValue())); double strength = sortableEdge.getEdgeValue(); minStrength = Math.min(minStrength, strength); maxStrength = Math.max(maxStrength, strength); strengthMap.put(idString, strength); counter++; } for (Entry<String, Double> entry : strengthMap.entrySet()) { edgeStrengthMap.put( entry.getKey(), (entry.getValue() - minStrength) / (maxStrength - minStrength)); } }
/**
 * Builds an agglomerative (bottom-up) cluster tree for the incoming example set and delivers the
 * resulting model together with the example set.
 *
 * <p>Steps: fill the upper triangle of a distance matrix, create one leaf node per example, then
 * repeatedly ask the selected linkage method for the next pair of clusters to merge until a
 * single root node is left.
 *
 * @throws OperatorException if fetching the input, the checks, the distance measure or a stop
 *     request raise one
 */
@Override
public void doWork() throws OperatorException {
  ExampleSet exampleSet = exampleSetInput.getData(ExampleSet.class);
  DistanceMeasure measure = measureHelper.getInitializedMeasure(exampleSet);

  // additional checks
  Tools.onlyNonMissingValues(exampleSet, getOperatorClassName(), this, new String[0]);
  Tools.checkAndCreateIds(exampleSet);

  Attribute idAttribute = exampleSet.getAttributes().getId();
  boolean idAttributeIsNominal = idAttribute.isNominal();

  DistanceMatrix matrix = new DistanceMatrix(exampleSet.size());
  Map<Integer, HierarchicalClusterNode> clusterMap =
      new HashMap<Integer, HierarchicalClusterNode>(exampleSet.size());
  int[] clusterIds = new int[exampleSet.size()];

  // filling the distance matrix
  int nextClusterId = 0;
  for (Example example1 : exampleSet) {
    checkForStop();
    clusterIds[nextClusterId] = nextClusterId;
    int y = 0;
    for (Example example2 : exampleSet) {
      // only pairs with column index above the row index are stored
      if (y > nextClusterId) {
        matrix.set(nextClusterId, y, measure.calculateDistance(example1, example2));
      }
      y++;
    }
    // the leaf carries the id either as string or as numerical value
    if (idAttributeIsNominal) {
      clusterMap.put(
          nextClusterId,
          new HierarchicalClusterLeafNode(nextClusterId, example1.getValueAsString(idAttribute)));
    } else {
      clusterMap.put(
          nextClusterId,
          new HierarchicalClusterLeafNode(nextClusterId, example1.getValue(idAttribute)));
    }
    nextClusterId++;
  }

  // creating linkage method: single linkage unless one of the other modes is selected
  AbstractLinkageMethod linkage = new SingleLinkageMethod(matrix, clusterIds);
  String mode = getParameterAsString(PARAMETER_MODE);
  if (mode.equals(modes[1])) {
    linkage = new CompleteLinkageMethod(matrix, clusterIds);
  } else if (mode.equals(modes[2])) {
    linkage = new AverageLinkageMethod(matrix, clusterIds);
  }

  // now building agglomerative tree bottom up
  while (clusterMap.size() > 1) {
    // the merge phase runs once per example, so keep the process stoppable here as well
    checkForStop();
    Agglomeration agglomeration = linkage.getNextAgglomeration(nextClusterId, clusterMap);
    HierarchicalClusterNode newNode =
        new HierarchicalClusterNode(nextClusterId, agglomeration.getDistance());
    newNode.addSubNode(clusterMap.get(agglomeration.getClusterId1()));
    newNode.addSubNode(clusterMap.get(agglomeration.getClusterId2()));
    clusterMap.remove(agglomeration.getClusterId1());
    clusterMap.remove(agglomeration.getClusterId2());
    clusterMap.put(nextClusterId, newNode);
    nextClusterId++;
  }

  // creating model from the single remaining node
  // NOTE(review): if no example reached this point the map is empty and next() throws
  // NoSuchElementException - confirm whether empty input is rejected upstream.
  HierarchicalClusterModel model =
      new DendogramHierarchicalClusterModel(clusterMap.entrySet().iterator().next().getValue());

  // registering visualizer
  ObjectVisualizerService.addObjectVisualizer(
      model, new ExampleVisualizer((ExampleSet) exampleSet.clone()));

  modelOutput.deliver(model);
  exampleSetOutput.deliver(exampleSet);
}
/**
 * Calculates the max score for each point.
 *
 * <p>For every point all grids restart at their root cell. The counting cells are first taken
 * {@code alpha} levels down; afterwards, for each level from {@code alpha} to {@code levels},
 * the grid whose counting center is nearest to the point and the grid whose sampling center is
 * nearest to that counting center are picked and handed to {@code calculateScore}. Below the
 * last level no further descent is done.
 */
private void calculateAllScores() {
  TreeNode[] countingNodes = new TreeNode[grids.length];
  TreeNode[] samplingNodes = new TreeNode[grids.length];
  double[][] countingCenters = new double[grids.length][dimensions];
  double[][] samplingCenters = new double[grids.length][dimensions];

  for (int pointIndex = 0; pointIndex < points.length; pointIndex++) {
    // start over at the root of every grid
    for (int grid = 0; grid < grids.length; grid++) {
      countingNodes[grid] = root[grid];
      samplingNodes[grid] = root[grid];
      countingCenters[grid] = createPoint(dimensions, Rp / 2);
      samplingCenters[grid] = createPoint(dimensions, Rp / 2);
    }
    double countingRadius = Rp;
    double samplingRadius = Rp;

    // the counting cells go alpha levels down before any score is calculated
    for (int step = 0; step < alpha; step++) {
      countingRadius /= 2;
      for (int grid = 0; grid < grids.length; grid++) {
        int child =
            cellFinder(
                points[pointIndex], countingCenters[grid], grids[grid], countingRadius / 2);
        countingNodes[grid] = countingNodes[grid].getChild(child);
      }
    }

    for (int level = alpha; level <= levels; level++) {
      int countingGrid = nearestCountingGrid(pointIndex, countingCenters);
      int samplingGrid = nearestSamplingGrid(samplingCenters, countingCenters, countingGrid);

      countingRadius /= 2;
      samplingRadius /= 2;
      calculateScore(
          countingNodes[countingGrid],
          samplingNodes[samplingGrid],
          pointIndex,
          level,
          samplingCenters[samplingGrid]);

      if (level >= levels) {
        continue;
      }
      // move both cells of every grid one level down
      for (int grid = 0; grid < grids.length; grid++) {
        int child =
            cellFinder(
                points[pointIndex], countingCenters[grid], grids[grid], countingRadius / 2);
        countingNodes[grid] = countingNodes[grid].getChild(child);
        child =
            cellFinder(
                points[pointIndex], samplingCenters[grid], grids[grid], samplingRadius / 2);
        samplingNodes[grid] = samplingNodes[grid].getChild(child);
      }
    }
  }
}

/**
 * Returns the index of the grid whose counting center has the smallest distance to the moved
 * point, or -1 if no distance is below {@code Double.MAX_VALUE}.
 */
private int nearestCountingGrid(int pointIndex, double[][] countingCenters) {
  double smallest = Double.MAX_VALUE;
  int nearest = -1;
  for (int grid = 0; grid < grids.length; grid++) {
    double distance =
        measure.calculateDistance(
            move(points[pointIndex], grids[grid], true), countingCenters[grid]);
    if (distance < smallest) {
      smallest = distance;
      nearest = grid;
    }
  }
  return nearest;
}

/**
 * Returns the index of the grid whose moved sampling center has the smallest distance to the
 * moved counting center of {@code countingGrid}, or -1 if no distance is below
 * {@code Double.MAX_VALUE}.
 */
private int nearestSamplingGrid(
    double[][] samplingCenters, double[][] countingCenters, int countingGrid) {
  double smallest = Double.MAX_VALUE;
  int nearest = -1;
  for (int grid = 0; grid < grids.length; grid++) {
    double distance =
        measure.calculateDistance(
            move(samplingCenters[grid], grids[grid], false),
            move(countingCenters[countingGrid], grids[countingGrid], false));
    if (distance < smallest) {
      smallest = distance;
      nearest = grid;
    }
  }
  return nearest;
}
@Override public ExampleSet apply(ExampleSet exampleSet) throws OperatorException { // creating kernel and settings from Parameters int k = Math.min(100, exampleSet.getAttributes().size() * 2); int size = exampleSet.size(); switch (getParameterAsInt(PARAMETER_SAMPLE)) { case SAMPLE_ABSOLUTE: size = getParameterAsInt(PARAMETER_SAMPLE_SIZE); break; case SAMPLE_RELATIVE: size = (int) Math.round(exampleSet.size() * getParameterAsDouble(PARAMETER_SAMPLE_RATIO)); break; } DistanceMeasure distanceMeasure = new EuclideanDistance(); distanceMeasure.init(exampleSet); // finding farthest and nearest example to mean Vector double[] meanVector = getMeanVector(exampleSet); Candidate min = new Candidate(meanVector, Double.POSITIVE_INFINITY, 0); Candidate max = new Candidate(meanVector, Double.NEGATIVE_INFINITY, 0); int i = 0; for (Example example : exampleSet) { double[] exampleValues = getExampleValues(example); Candidate current = new Candidate( exampleValues, Math.abs(distanceMeasure.calculateDistance(meanVector, exampleValues)), i); if (current.compareTo(min) < 0) { min = current; } if (current.compareTo(max) > 0) { max = current; } i++; } ArrayList<Candidate> recentlySelected = new ArrayList<Candidate>(10); int[] partition = new int[exampleSet.size()]; int numberOfSelectedExamples = 2; recentlySelected.add(min); recentlySelected.add(max); partition[min.getExampleIndex()] = 1; partition[max.getExampleIndex()] = 1; double[] minimalDistances = new double[exampleSet.size()]; Arrays.fill(minimalDistances, Double.POSITIVE_INFINITY); // running now through examples, checking for smallest distance to one of the candidates while (numberOfSelectedExamples < size) { TreeSet<Candidate> candidates = new TreeSet<Candidate>(); i = 0; // check distance only for candidates recently selected. 
for (Example example : exampleSet) { // if example not has been selected allready if (partition[i] == 0) { double[] exampleValues = getExampleValues(example); for (Candidate candidate : recentlySelected) { minimalDistances[i] = Math.min( minimalDistances[i], Math.abs( distanceMeasure.calculateDistance(exampleValues, candidate.getValues()))); } Candidate newCandidate = new Candidate(exampleValues, minimalDistances[i], i); candidates.add(newCandidate); if (candidates.size() > k) { Iterator<Candidate> iterator = candidates.iterator(); iterator.next(); iterator.remove(); } } i++; } // clearing recently selected since now new ones will be selected recentlySelected.clear(); // now running in descending order through candidates and adding to selected // IM: descendingIterator() is not available in Java versions less than 6 !!! // IM: Bad workaround for now by adding all candidates into a list and using a listIterator() // and hasPrevious... /* Iterator<Candidate> descendingIterator = candidates.descendingIterator(); while (descendingIterator.hasNext() && numberOfSelectedExamples < desiredNumber) { Candidate candidate = descendingIterator.next(); */ List<Candidate> reverseCandidateList = new LinkedList<Candidate>(); Iterator<Candidate> it = candidates.iterator(); while (it.hasNext()) { reverseCandidateList.add(it.next()); } ListIterator<Candidate> lit = reverseCandidateList.listIterator(reverseCandidateList.size() - 1); while (lit.hasPrevious()) { Candidate candidate = lit.previous(); // IM: end of workaround boolean existSmallerDistance = false; Iterator<Candidate> addedIterator = recentlySelected.iterator(); // test if a distance to recently selected is smaller than previously calculated minimal // distance // if one exists: This is not selected while (addedIterator.hasNext()) { double distance = Math.abs( distanceMeasure.calculateDistance( addedIterator.next().getValues(), candidate.getValues())); existSmallerDistance = existSmallerDistance || distance < 
candidate.getDistance(); } if (!existSmallerDistance) { recentlySelected.add(candidate); partition[candidate.getExampleIndex()] = 1; numberOfSelectedExamples++; } else break; } } // building new exampleSet containing only Examples with indices in selectedExamples SplittedExampleSet sample = new SplittedExampleSet(exampleSet, new Partition(partition, 2)); sample.selectSingleSubset(1); return sample; }