/**
 * Merges the postings of a single term from several source inverted lists and writes the
 * combined postings to {@code output}.
 *
 * <p>Each source is queried through {@code lookupTerm}; the document counts it reports are summed
 * and used as the divisor of the inverse-document-frequency ratio
 * ({@code totalDocuments / summedCount}). The per-source document iterators are merged, occurrences
 * that {@code DOC_OCC_COMP} treats as equal are grouped, and for every group one document entry is
 * written followed by the merged position list. For every written document the value
 * {@code (idf * termFrequency)^2} is added to that document's entry in {@code tfIdfAccumulator}.
 *
 * @param term the term announced to {@code output} before any documents are written
 * @param output builder receiving the term, its documents and their positions
 * @param totalDocuments numerator of the inverse-document-frequency ratio
 * @param tfIdfAccumulator per-document running sum of squared weights; updated in place
 * @param sources pairs of (term, pointer) and the inverted list they are looked up in
 * @return the pointer returned by {@code output.nextTerm(term)}
 * @throws IOException as declared; presumably propagated from the lists or the builder
 * @throws InterruptedException as declared; presumably propagated from the lists or the builder
 */
private static InvertedListPointer writeToInvertedList(
    DictionaryTerm term,
    InvertedListBuilder output,
    long totalDocuments,
    Map<Long, Float> tfIdfAccumulator,
    Pair<Pair<DictionaryTerm, InvertedListPointer>, InvertedList>... sources)
    throws IOException, InterruptedException {
  // Generic arrays cannot be created directly; every slot is filled with a matching iterator below.
  @SuppressWarnings("unchecked")
  Iterator<DocumentOccurence>[] perSourceDocuments = new Iterator[sources.length];
  long documentsWithTerm = 0;
  for (int i = 0; i < sources.length; i++) {
    InvertedList list = sources[i].getSecond();
    Pair<DictionaryTerm, InvertedListPointer> key = sources[i].getFirst();
    Pair<Iterator<DocumentOccurence>, Long> lookup =
        list.lookupTerm(key.getFirst(), key.getSecond());
    documentsWithTerm += lookup.getSecond();
    perSourceDocuments[i] = lookup.getFirst();
  }

  // Plain ratio, computed in float precision.
  float idf = (float) totalDocuments / (float) documentsWithTerm;

  Iterator<DocumentOccurence> merged = new IteratorMerger<DocumentOccurence>(perSourceDocuments);
  Iterator<Collection<DocumentOccurence>> grouped =
      new DuplicateCollectingIterator<DocumentOccurence>(DOC_OCC_COMP, merged);

  InvertedListPointer termStart = output.nextTerm(term);

  while (grouped.hasNext()) {
    Collection<DocumentOccurence> sameDocument = grouped.next();

    // Same unchecked generic-array situation as above.
    @SuppressWarnings("unchecked")
    Iterator<Integer>[] perOccurrencePositions = new Iterator[sameDocument.size()];

    // -1 marks "no id seen yet"; the first occurrence of the group supplies the id.
    long docId = -1;
    int slot = 0;
    for (DocumentOccurence occurrence : sameDocument) {
      if (docId != -1) {
        assert docId == occurrence.getDocumentId();
      } else {
        docId = occurrence.getDocumentId();
      }
      perOccurrencePositions[slot] = occurrence.getPositionList().iterator();
      slot++;
    }

    output.nextDocument(docId);

    // Every position written counts once towards the term frequency of this document.
    Iterator<Integer> positions = new IteratorMerger<Integer>(perOccurrencePositions);
    long termFrequency = 0;
    for (; positions.hasNext(); termFrequency++) {
      output.nextOccurence(positions.next());
    }

    Float stored = tfIdfAccumulator.get(docId);
    float previous = 0;
    if (stored != null) {
      previous = stored;
    }
    float squaredWeight = (float) Math.pow(idf * termFrequency, 2.0);
    tfIdfAccumulator.put(docId, previous + squaredWeight);
  }

  return termStart;
}
public int compare(DocumentOccurence o1, DocumentOccurence o2) { return o1.getDocumentId() == o2.getDocumentId() ? 0 : 1; // only used for equality ok to break contract. }