/**
 * Emits a (generated key, serialized document) pair for one input record.
 *
 * <p>The incoming bytes are parsed as a {@code DocumentWrapper}; a key is produced from its
 * document metadata by {@code keyGen.generateKey(metadata, 0)}. Records for which the generated key
 * is empty are dropped. For the remaining ones the value written is the serialized result of
 * {@code DocumentWrapperUtils.cloneDocumentMetadata} (presumably a lighter, metadata-only copy of
 * the document - confirm against that helper).
 *
 * @param key input key; not used
 * @param value serialized {@code DocumentWrapper}
 * @param context mapper context the output pair is written to
 * @throws IOException if the value cannot be parsed or the write fails
 * @throws InterruptedException if the write is interrupted
 */
@Override
  public void map(
      Writable key,
      BytesWritable value,
      Mapper<Writable, BytesWritable, Text, BytesWritable>.Context context)
      throws IOException, InterruptedException {

    byte[] serializedDocument = value.copyBytes();
    DocumentWrapper fullDocument = DocumentProtos.DocumentWrapper.parseFrom(serializedDocument);

    String groupingKey = keyGen.generateKey(fullDocument.getDocumentMetadata(), 0);
    if (groupingKey.isEmpty()) {
      // No usable key could be generated for this document, so it is skipped.
      return;
    }

    DocumentWrapper metadataOnly = DocumentWrapperUtils.cloneDocumentMetadata(fullDocument);
    byte[] payload = metadataOnly.toByteArray();
    context.write(new Text(groupingKey), new BytesWritable(payload));
  }
  /**
   * Expands a serialized document into one tuple per author.
   *
   * <p>The first field of {@code input} is read as a {@code DataByteArray} holding a serialized
   * {@code DocumentWrapper}. For every author listed in the document's basic metadata a tuple
   * {@code (surname, serialized document metadata, author index)} is added to the returned bag;
   * the author index is the zero-based position in the author list.
   *
   * @param input tuple whose first field is the serialized document
   * @return bag with one tuple per author, or {@code null} when {@code input} is {@code null} or
   *     empty
   * @throws IOException if the field cannot be read, the bytes cannot be parsed, or any other
   *     failure occurs while processing the row; the original exception is attached as the cause
   */
  @Override
  public DataBag exec(Tuple input) throws IOException {

    if (input == null || input.size() == 0) {
      return null;
    }

    try {
      DataByteArray dba;
      try {
        dba = (DataByteArray) input.get(0);
      } catch (ExecException e) {
        logger.error("Error in reading field:", e);
        throw e;
      }

      DocumentWrapper dm;
      try {
        dm = DocumentWrapper.parseFrom(dba.get());
      } catch (Exception e) {
        logger.error("Error in reading ByteArray to DocumentMetadata:", e);
        throw e;
      }

      // The same serialized metadata instance is shared by every tuple of this document.
      DataByteArray metadata = new DataByteArray(dm.getDocumentMetadata().toByteArray());
      List<Author> authors = dm.getDocumentMetadata().getBasicMetadata().getAuthorList();

      // Loop-invariant: fetch the factory once instead of once per author.
      TupleFactory tupleFactory = TupleFactory.getInstance();
      DataBag ret = new DefaultDataBag();

      int authorIndex = 0;
      for (Author author : authors) {
        ret.add(
            tupleFactory.newTuple(
                Arrays.<Object>asList(author.getSurname(), metadata, authorIndex)));
        authorIndex++;
      }

      return ret;

    } catch (Exception e) {
      logger.error("Error in processing input row:", e);
      // Keep the historical message text, but also chain the cause so that callers and log
      // frameworks can inspect the real exception instead of parsing a string.
      throw new IOException(
          "Caught exception processing input row:\n" + StackTraceExtractor.getStackTrace(e), e);
    }
  }
Пример #3
0
  /**
   * Passes through a configurable percentage of the input documents, keyed by their row id.
   *
   * <p>The percentage is read from the job configuration entry {@code "percentOfWritten"}
   * (default 100). Records are selected by their running position {@code i} within this mapper:
   * out of every 100 consecutive records the last {@code percentOfWritten} are written, so 100
   * writes everything and 0 writes nothing.
   *
   * @param key input key; not used
   * @param value serialized {@code DocumentWrapper}
   * @param context mapper context providing the configuration and receiving the output
   * @throws IOException if the value cannot be parsed or the write fails
   * @throws InterruptedException if the write is interrupted
   */
  @Override
  protected void map(Writable key, BytesWritable value, Context context)
      throws IOException, InterruptedException {

    int percentOfWritten = context.getConfiguration().getInt("percentOfWritten", 100);

    // Copy the payload once; the same array is used for parsing and for the output value.
    byte[] serialized = value.copyBytes();
    DocumentWrapper docWrapper = DocumentProtos.DocumentWrapper.parseFrom(serialized);

    // A document without any title must not fail the task just because of a log statement.
    if (docWrapper.getDocumentMetadata().getBasicMetadata().getTitleCount() > 0) {
      log.info(
          "work title = "
              + docWrapper.getDocumentMetadata().getBasicMetadata().getTitle(0).getText());
    }

    // Cycle of exactly 100 positions: residues [100 - percentOfWritten, 99] are written.
    // (The previous "% 101 ... >" form skipped one record per cycle even at 100 percent.)
    if ((i % 100) >= 100 - percentOfWritten) {
      log.info("writing...");
      context.write(new Text(docWrapper.getRowId()), new BytesWritable(serialized));
    }

    i++;
  }
Пример #4
0
  /**
   * Groups the passed documents into sets of duplicates. Two documents count as duplicates when
   * {@link DuplicateWorkVoter#isDuplicate(DocumentWrapper, DocumentWrapper)} says so.
   *
   * <p>Every input document is compared, in input order, against a shrinking pool of candidates.
   * Candidates with the same row id as the current document (the document itself included) are
   * removed from the pool without being compared. A candidate recognised as a duplicate is
   * recorded via {@code addSameWorks} together with the current document and is removed from the
   * pool as well, so it is not offered to the documents processed later.
   *
   * <p>The integer handed to {@code addSameWorks} is the zero-based position of the current
   * document in {@code documents}; presumably it becomes the key of the group in the returned map
   * (confirm against {@code addSameWorks}).
   *
   * <p>Example: for the documents AAA, BBb, bbb, AAa, aAA, ccc, where AAA is a duplicate of AAa
   * and aAA, and BBb is a duplicate of bbb, the result looks like {@code <k1, <AAA, AAa, aAA>>},
   * {@code <k2, <BBb, bbb>>}.
   *
   * @param documents documents to examine; the list itself is not modified
   * @return map from group key to the set of documents considered the same work
   */
  public Map<Integer, Set<DocumentWrapper>> findDuplicates(List<DocumentWrapper> documents) {
    Map<Integer, Set<DocumentWrapper>> duplicateGroups = Maps.newHashMap();

    // Working pool of documents that may still be matched; shrinks as the scan proceeds.
    List<DocumentWrapper> candidatePool = Lists.newArrayList(documents);

    int documentPosition = 0;
    for (DocumentWrapper document : documents) {

      // Scan a snapshot so that entries can be removed from the pool during the scan.
      List<DocumentWrapper> snapshot = new ArrayList<DocumentWrapper>(candidatePool);
      for (DocumentWrapper candidate : snapshot) {
        boolean sameRow = document.getRowId().equals(candidate.getRowId());
        if (sameRow) {
          candidatePool.remove(candidate);
          continue;
        }
        if (duplicateWorkVoter.isDuplicate(document, candidate)) {
          addSameWorks(duplicateGroups, documentPosition, document, candidate);
          candidatePool.remove(candidate);
        }
      }
      documentPosition++;
    }
    return duplicateGroups;
  }
Пример #5
0
  /**
   * Extracts {@code (doi, year, title)} from a serialized document.
   *
   * <p>The first field of {@code input} is read as a {@code DataByteArray} holding a serialized
   * {@code DocumentWrapper}. The title is the first title whose language starts with "en"
   * (case-insensitively), falling back to the first title; diacritics are removed, every
   * character outside {@code [A-Za-z0-9-_]} is replaced by a space and whitespace runs are
   * collapsed. DOI and year are whitespace-normalised and trimmed.
   *
   * <p>If any step fails or yields an empty value, the matching counter in the
   * "extraction problems" group is incremented and {@code null} is returned.
   *
   * @param input tuple whose first field is the serialized document
   * @return tuple {@code (doi, year, title)}, or {@code null} when the input is {@code null} or
   *     empty, or when a required value cannot be extracted
   * @throws IOException wrapping any unexpected exception raised while processing the row
   */
  @Override
  public Tuple exec(Tuple input) throws IOException {

    myreporter = PigStatusReporter.getInstance();

    if (input == null || input.size() == 0) {
      return null;
    }

    try {
      DataByteArray dba;
      try {
        dba = (DataByteArray) input.get(0);
      } catch (Exception ignored) {
        // Unreadable field: counted and skipped rather than failing the job.
        countExtractionProblem("DataByteArray from tuple");
        return null;
      }

      DocumentMetadata dm;
      try {
        dm = DocumentWrapper.parseFrom(dba.get()).getDocumentMetadata();
      } catch (Exception ignored) {
        // Also covers a null first field (NullPointerException on dba.get()).
        countExtractionProblem("document metadata");
        return null;
      }

      String title = null;
      try {
        for (TextWithLanguage twl : dm.getBasicMetadata().getTitleList()) {
          if (twl.getLanguage().toLowerCase().startsWith("en")) {
            title = twl.getText();
            break;
          }
        }
        if (title == null) {
          // No English title: fall back to the first one (throws if there is none at all).
          title = dm.getBasicMetadata().getTitle(0).getText();
        }
        if (title != null && !title.trim().isEmpty()) {
          title = DiacriticsRemover.removeDiacritics(title);
          title = title.replaceAll("[^A-Za-z0-9\\-_]", " ").replaceAll("\\s++", " ").trim();
        }
      } catch (Exception ignored) {
        // Best effort: whatever was extracted so far is validated right below.
      }
      // Checked after the try (not in a finally) so that Errors are never swallowed by a return.
      if (title == null || title.trim().isEmpty()) {
        countExtractionProblem("title extraction");
        return null;
      }

      String doi = null;
      try {
        doi = dm.getBasicMetadata().getDoi().replaceAll("\\s++", " ").trim();
      } catch (Exception ignored) {
        // Best effort: a missing DOI is reported through the counter below.
      }
      if (doi == null || doi.isEmpty()) {
        countExtractionProblem("doi extraction");
        return null;
      }

      String year = null;
      try {
        year = dm.getBasicMetadata().getYear().replaceAll("\\s++", " ").trim();
      } catch (Exception ignored) {
        // Best effort: a missing year is reported through the counter below.
      }
      if (year == null || year.isEmpty()) {
        countExtractionProblem("year extraction");
        return null;
      }

      Tuple t = TupleFactory.getInstance().newTuple();
      t.append(doi);
      t.append(year);
      t.append(title);

      return t;
    } catch (Exception e) {
      logger.debug(StackTraceExtractor.getStackTrace(e));
      throw new IOException(e);
    }
  }

  /**
   * Adds one to the named counter of the "extraction problems" group.
   *
   * <p>Merely looking a counter up does not change its value, so the increment is explicit.
   * NOTE(review): the reporter is assumed to possibly return no counter when no task context is
   * available (e.g. local runs) - hence the null guard; confirm for the Pig version in use.
   */
  private void countExtractionProblem(String counterName) {
    if (myreporter.getCounter("extraction problems", counterName) != null) {
      myreporter.getCounter("extraction problems", counterName).increment(1);
    }
  }