/**
 * Counts how many data elements of a cluster hold a document that reports a positive term count
 * for the given term.
 *
 * @param term the term to look up in each document
 * @param cluster the cluster whose data elements are inspected
 * @return the number of data elements whose document has {@code getTermCount(term) > 0}
 */
private static int getDocumentCount(String term, Cluster<DocumentDataElement> cluster) {
  int matchingDocuments = 0;
  for (DocumentDataElement member : cluster.getDataElements()) {
    boolean containsTerm = member.getDocument().getTermCount(term) > 0;
    if (!containsTerm) {
      continue;
    }
    matchingDocuments += 1;
  }
  return matchingDocuments;
}
public static void assignLabels( List<Cluster<DocumentDataElement>> clusterData, DocumentDataSet dataSet) { for (Cluster<DocumentDataElement> cluster : clusterData) { MinMaxPriorityQueue<TermEntry> queue = MinMaxPriorityQueue.orderedBy( new Comparator<TermEntry>() { @Override public int compare(TermEntry o1, TermEntry o2) { return -Double.compare(o1.getScore(), o2.getScore()); } }) .maximumSize(5) .create(); DocumentCollection localCollection = new DocumentCollection(); for (DocumentDataElement elem : cluster.getDataElements()) { localCollection.addDocument(elem.getDocument()); } DocumentVSMGenerator docToVsm = new TFIDF(); DocumentDataSet clusterDataSet = docToVsm.createVSM(localCollection); // TODO remove this try { CSVDataSetExporter.export(clusterDataSet, new File("tmp/" + cluster.getLabel() + ".csv")); } catch (IOException e) { } for (DocumentDataElement elem : clusterDataSet.elements()) { Document document = elem.getDocument(); for (String term : document.getAllTerms()) { double termWeight = clusterDataSet.getTermWeight(document.getId(), term); queue.offer(new TermEntry(term, termWeight * getDocumentCount(term, cluster))); } } String label = ""; StringBuilder labelBuilder = new StringBuilder(); TreeSet<String> words = Sets.newTreeSet(); // TODO this is a debug version of labels for (TermEntry termEntry : queue) { labelBuilder .append(termEntry.getTerm()) .append(":") .append(String.format("%7.5f", termEntry.getScore())) .append(";") .append(getDocumentCount(termEntry.getTerm(), cluster)) .append(","); words.add(termEntry.getTerm()); } if (labelBuilder.length() > 0) { label = labelBuilder.substring(0, labelBuilder.length() - 1); } cluster.setLabel(words.toString()); } }