Пример #1
0
  /**
   * Scans the given documents for duplicates and registers every detected pair in the returned
   * map. Whether two documents are duplicates is decided by {@link
   * DuplicateWorkVoter#isDuplicate(DocumentWrapper, DocumentWrapper)}.
   *
   * <p>Documents are taken as the reference one by one, in list order, and compared with the
   * documents that are still in the pool of unmatched candidates. A candidate that has the same
   * row id as the reference is dropped from the pool without consulting the voter. A candidate
   * that the voter reports as a duplicate is registered through {@code addSameWorks} under the
   * zero-based position of the reference in {@code documents} and is dropped from the pool as
   * well, so it is never offered as a candidate to later references.
   *
   * <p>Example: for the documents AAA, BBb, bbb, AAa, aAA, ccc, where AAA is a duplicate of AAa
   * and aAA, and BBb is a duplicate of bbb, the result is expected to look like {@code {0=[AAA,
   * AAa, aAA], 1=[BBb, bbb]}} (assuming {@code addSameWorks} collects both documents in the set
   * stored under the given key). Keys are positions of reference documents, so they are not
   * necessarily consecutive.
   *
   * @param documents the documents to check; the list itself is not modified
   * @return map from the position of a reference document to the duplicates registered for it;
   *     positions without any detected duplicate are not added by this method
   */
  public Map<Integer, Set<DocumentWrapper>> findDuplicates(List<DocumentWrapper> documents) {
    Map<Integer, Set<DocumentWrapper>> duplicateGroups = Maps.newHashMap();

    // Pool of documents that have neither served as a reference nor been matched yet.
    List<DocumentWrapper> unmatched = new ArrayList<DocumentWrapper>(documents);

    int referenceIndex = 0;
    for (DocumentWrapper reference : documents) {

      // Walk over a snapshot so that the pool can shrink while it is being scanned.
      for (DocumentWrapper candidate : Lists.newArrayList(unmatched)) {
        boolean sameRow = reference.getRowId().equals(candidate.getRowId());

        // The voter is only asked when the candidate is not the reference row itself.
        if (!sameRow && !duplicateWorkVoter.isDuplicate(reference, candidate)) {
          continue;
        }

        if (!sameRow) {
          addSameWorks(duplicateGroups, referenceIndex, reference, candidate);
        }
        unmatched.remove(candidate);
      }
      referenceIndex++;
    }
    return duplicateGroups;
  }
Пример #2
0
  /**
   * Parses the incoming serialized document and writes a configurable share of the records to
   * the output, keyed by the document's row id.
   *
   * <p>The share is read from the {@code percentOfWritten} configuration property (default
   * {@code 100}). Records are counted with the field {@code i} and handled in windows of 100:
   * within each window the last {@code percentOfWritten} records are written and the preceding
   * ones are skipped. With the default value every record is written; with {@code 0} none is.
   *
   * @param key the input key; not used
   * @param value the serialized {@code DocumentWrapper} message
   * @param context the task context used to read the configuration and to emit output
   * @throws IOException propagated from parsing the message or from writing to the context
   * @throws InterruptedException propagated from writing to the context
   */
  @Override
  protected void map(Writable key, BytesWritable value, Context context)
      throws IOException, InterruptedException {

    int percentOfWritten = context.getConfiguration().getInt("percentOfWritten", 100);

    // Copy the payload once; the same array serves both parsing and the output record.
    byte[] payload = value.copyBytes();
    DocumentWrapper docWrapper = DocumentProtos.DocumentWrapper.parseFrom(payload);

    // A document without any title must not fail the task just because of a log statement.
    if (docWrapper.getDocumentMetadata().getBasicMetadata().getTitleCount() > 0) {
      log.info(
          "work title = "
              + docWrapper.getDocumentMetadata().getBasicMetadata().getTitle(0).getText());
    }

    // The position inside the window is 0..99, so exactly percentOfWritten positions out of
    // every 100 satisfy the condition (all of them for 100, none for 0).
    if ((i % 100) >= 100 - percentOfWritten) {
      log.info("writing...");
      context.write(new Text(docWrapper.getRowId()), new BytesWritable(payload));
    }

    i++;
  }