/**
 * Groups the passed documents into sets of duplicates.
 *
 * <p>Two documents are treated as duplicates when {@link
 * DuplicateWorkVoter#isDuplicate(DocumentWrapper, DocumentWrapper)} says so. Documents sharing
 * the same row id are treated as the same document and are never reported as duplicates of each
 * other.
 *
 * <p>Each document of the input list is compared only with the documents that have not yet been
 * matched (or identified as the document itself) during an earlier comparison. Every match is
 * recorded through {@code addSameWorks} under a key equal to the zero-based position of the
 * document currently being examined, so every group ends up under its own key.
 *
 * <p>Example: for the input {@code AAA, BBb, bbb, AAa, aAA, ccc}, where {@code AAA} is a
 * duplicate of {@code AAa} and {@code aAA}, and {@code BBb} is a duplicate of {@code bbb}, the
 * result will be something like {@code 0 -> [AAA, AAa, aAA]} and {@code 1 -> [BBb, bbb]}.
 *
 * @param documents documents to search for duplicates; the list itself is not modified
 * @return map from group key to the set of documents found to be duplicates of one another;
 *     empty when no duplicates were found
 */
public Map<Integer, Set<DocumentWrapper>> findDuplicates(List<DocumentWrapper> documents) {
  Map<Integer, Set<DocumentWrapper>> groups = Maps.newHashMap();
  // Documents still available as comparison candidates; shrinks as matches are found.
  List<DocumentWrapper> remaining = Lists.newArrayList(documents);

  int groupKey = 0;
  for (DocumentWrapper current : documents) {
    // Iterate over a snapshot so that 'remaining' can be modified inside the loop.
    List<DocumentWrapper> snapshot = new ArrayList<DocumentWrapper>(remaining);
    for (DocumentWrapper candidate : snapshot) {
      boolean sameRow = current.getRowId().equals(candidate.getRowId());
      if (!sameRow) {
        if (!duplicateWorkVoter.isDuplicate(current, candidate)) {
          // Unrelated document: keep it available for later comparisons.
          continue;
        }
        addSameWorks(groups, groupKey, current, candidate);
      }
      // Either the document itself or a confirmed duplicate: no need to compare it again.
      remaining.remove(candidate);
    }
    groupKey++;
  }

  return groups;
}
/**
 * Parses the incoming value as a {@code DocumentWrapper}, logs its first title (if any) and
 * writes a configurable share of the records to the output, keyed by the document row id.
 *
 * <p>The share is read from the {@code percentOfWritten} configuration property (default 100).
 * Records are selected by their position in the sequence counted by the field {@code i}: out of
 * every 100 consecutive records the last {@code percentOfWritten} are written. A value of 100 (or
 * more) writes every record, a value of 0 (or less) writes none.
 *
 * @param key input key; not used
 * @param value serialized {@code DocumentWrapper} message
 * @param context job context used to read the configuration and emit output
 * @throws IOException if the value cannot be parsed or the output cannot be written
 * @throws InterruptedException if writing to the context is interrupted
 */
@Override
protected void map(Writable key, BytesWritable value, Context context)
    throws IOException, InterruptedException {
  int percentOfWritten = context.getConfiguration().getInt("percentOfWritten", 100);

  // Copy the payload once; the same array serves both parsing and the emitted value.
  byte[] serialized = value.copyBytes();
  DocumentWrapper docWrapper = DocumentProtos.DocumentWrapper.parseFrom(serialized);

  // The title is only logged, so a document without titles must not fail the whole task.
  if (docWrapper.getDocumentMetadata().getBasicMetadata().getTitleCount() > 0) {
    log.info(
        "work title = "
            + docWrapper.getDocumentMetadata().getBasicMetadata().getTitle(0).getText());
  }

  // i % 100 cycles through 0..99; keeping the top 'percentOfWritten' slots yields exactly that
  // many records per hundred (all of them for 100, none for 0).
  if ((i % 100) >= 100 - percentOfWritten) {
    log.info("writing...");
    context.write(new Text(docWrapper.getRowId()), new BytesWritable(serialized));
  }
  i++;
}