private ListMap sortNodesByAttributePrefix( Table nodeTable, String compareAttributeName, int numPrefixLetters) { ListMap nodesByAttributePrefix = new ListMap(); // for each node in the node table... for (IntIterator nodeIndexIt = nodeTable.rows(); nodeIndexIt.hasNext(); ) { int nodeIndex = nodeIndexIt.nextInt(); Tuple row = nodeTable.getTuple(nodeIndex); // get the attribute contents String comparisonAttributeContents = row.getString(compareAttributeName); if (comparisonAttributeContents == null) continue; // add the node index to our list map, in the bin with a key made from a prefix of the // attribute e.g. "do" for "donkey" String prefixKey = extractPrefixKey(comparisonAttributeContents, numPrefixLetters); nodesByAttributePrefix.put(prefixKey, new Integer(nodeIndex)); } return nodesByAttributePrefix; }
private Graph makeMergeGraph(Table nodeTable, StringBuffer noteLog) { Graph mergeGraph = makeEmptyMergeGraph(nodeTable); // for each group of nodes with a common attribute prefix... ListMap groupedNodes = sortNodesByAttributePrefix(nodeTable, this.compareAttributeName, this.numPrefixLetters); for (Iterator groupIt = groupedNodes.values().iterator(); groupIt.hasNext(); ) { List nodeGroup = (List) groupIt.next(); // for each pair of nodes in the group... for (int i = 0; i < nodeGroup.size(); i++) { Integer firstNodeIndex = (Integer) nodeGroup.get(i); for (int j = i; j < nodeGroup.size(); j++) { Integer secondNodeIndex = (Integer) nodeGroup.get(j); // test how similar the two nodes are float similarity = compareNodesCaseInsensitiveBy( this.compareAttributeName, firstNodeIndex, secondNodeIndex, nodeTable); // if their similarity is high enough to merge... if (similarity >= this.mergeOnSimilarity) { // link the nodes in the merge graph mergeGraph.addEdge(firstNodeIndex.intValue(), secondNodeIndex.intValue()); } // else if their similarity is noteworthy... else if (similarity >= this.makeNoteOnSimilarity) { // record it in the log String nodeOneAttribute = (String) nodeTable.getString(firstNodeIndex.intValue(), this.compareAttributeName); String nodeTwoAttribute = (String) nodeTable.getString(secondNodeIndex.intValue(), this.compareAttributeName); noteLog.append("" + similarity + " similar:" + "\r\n"); noteLog.append(" \"" + nodeOneAttribute + "\"" + "\r\n"); noteLog.append(" \"" + nodeTwoAttribute + "\"" + "\r\n"); } } } } return mergeGraph; }