/**
 * Emits one record per input document, keyed by the value produced by {@code keyGen}.
 *
 * <p>The incoming bytes are parsed as a {@code DocumentWrapper}. A key is generated from its
 * document metadata, and when that key is non-empty the result of
 * {@code DocumentWrapperUtils.cloneDocumentMetadata} is serialized and written under it. Documents
 * whose generated key is empty produce no output.
 *
 * @param key input key; not read by this method
 * @param value serialized {@code DocumentWrapper}
 * @param context Hadoop context the output pair is written to
 * @throws IOException if the document cannot be parsed or the output cannot be written
 * @throws InterruptedException if the write is interrupted
 */
@Override
public void map(
    Writable key,
    BytesWritable value,
    Mapper<Writable, BytesWritable, Text, BytesWritable>.Context context)
    throws IOException, InterruptedException {
  byte[] serializedDocument = value.copyBytes();
  DocumentWrapper document = DocumentProtos.DocumentWrapper.parseFrom(serializedDocument);

  // NOTE(review): the meaning of the second argument (0) is defined by keyGen — confirm there.
  String generatedKey = keyGen.generateKey(document.getDocumentMetadata(), 0);
  if (generatedKey.isEmpty()) {
    // No usable key for this document: skip it.
    return;
  }

  // Presumably a lighter copy carrying only the metadata — verify against DocumentWrapperUtils.
  DocumentWrapper metadataCopy = DocumentWrapperUtils.cloneDocumentMetadata(document);
  Text outputKey = new Text(generatedKey);
  BytesWritable outputValue = new BytesWritable(metadataCopy.toByteArray());
  context.write(outputKey, outputValue);
}
/**
 * Expands a serialized document into one tuple per author.
 *
 * <p>Field 0 of {@code input} is read as a {@code DataByteArray} holding a serialized
 * {@code DocumentWrapper}. For every author in the document's basic metadata a tuple
 * {@code (surname, serializedDocumentMetadata, authorIndex)} is added to the returned bag, where
 * {@code authorIndex} is the author's position in the author list.
 *
 * @param input tuple whose first field is the serialized document
 * @return a bag with one tuple per author (empty when the document has no authors), or
 *     {@code null} when {@code input} is {@code null} or has no fields
 * @throws IOException if the field cannot be read or parsed; the original exception is attached as
 *     the cause and its stack trace is also included in the message
 */
@Override
public DataBag exec(Tuple input) throws IOException {
  if (input == null || input.size() == 0) {
    return null;
  }
  try {
    DataByteArray dba = null;
    try {
      dba = (DataByteArray) input.get(0);
    } catch (ExecException e) {
      logger.error("Error in reading field:", e);
      throw e;
    }

    DocumentWrapper document = null;
    try {
      document = DocumentWrapper.parseFrom(dba.get());
    } catch (Exception e) {
      logger.error("Error in reading ByteArray to DocumentMetadata:", e);
      throw e;
    }

    DataBag ret = new DefaultDataBag();
    // The same serialized metadata instance is shared by every tuple of this document.
    DataByteArray metadata = new DataByteArray(document.getDocumentMetadata().toByteArray());
    List<Author> authors = document.getDocumentMetadata().getBasicMetadata().getAuthorList();
    TupleFactory tupleFactory = TupleFactory.getInstance();
    for (int i = 0; i < authors.size(); i++) {
      String sname = authors.get(i).getSurname();
      ret.add(tupleFactory.newTuple(Arrays.<Object>asList(sname, metadata, i)));
    }
    return ret;
  } catch (Exception e) {
    logger.error("Error in processing input row:", e);
    // Keep the textual stack trace in the message as before, and also chain the
    // original exception so callers can inspect the real cause.
    throw new IOException(
        "Caught exception processing input row:\n" + StackTraceExtractor.getStackTrace(e), e);
  }
}
/**
 * Writes a configurable share of the incoming documents, keyed by their row id.
 *
 * <p>The share is read from the job configuration entry {@code percentOfWritten} (default 100).
 * Records are selected by their running index {@code i}: within every window of 100 consecutive
 * records the last {@code percentOfWritten} are written. With the default of 100 every record is
 * written, and with 0 none is.
 *
 * @param key input key; not read by this method
 * @param value serialized {@code DocumentWrapper}
 * @param context Hadoop context providing the configuration and receiving the output
 * @throws IOException if the document cannot be parsed or the output cannot be written
 * @throws InterruptedException if the write is interrupted
 */
@Override
protected void map(Writable key, BytesWritable value, Context context)
    throws IOException, InterruptedException {
  int percentOfWritten = context.getConfiguration().getInt("percentOfWritten", 100);

  // Copy the payload once; the same array is used for parsing and for the output value.
  byte[] serialized = value.copyBytes();
  DocumentWrapper docWrapper = DocumentProtos.DocumentWrapper.parseFrom(serialized);

  // A document without any title must not fail the task just because of a log statement.
  String firstTitle =
      docWrapper.getDocumentMetadata().getBasicMetadata().getTitleList().isEmpty()
          ? ""
          : docWrapper.getDocumentMetadata().getBasicMetadata().getTitle(0).getText();
  log.info("work title = " + firstTitle);

  // i % 100 is in 0..99, so exactly percentOfWritten out of every 100 indices pass this test.
  // The earlier "(i % 101) > 100 - percentOfWritten" form could never select index 0 of a
  // window, so even at 100 percent the first record and every 101st one were dropped.
  if ((i % 100) >= 100 - percentOfWritten) {
    log.info("writing...");
    context.write(new Text(docWrapper.getRowId()), new BytesWritable(serialized));
  }
  i++;
}
/**
 * Groups the passed documents into sets of duplicates.
 *
 * <p>Whether two documents are duplicates is decided by {@link
 * DuplicateWorkVoter#isDuplicate(DocumentWrapper, DocumentWrapper)}. Documents with equal row ids
 * are treated as the same document and are never reported as duplicates of each other.
 *
 * <p>Each document of the input list is used in turn as a pivot and compared with the documents
 * that have not yet been matched. Every match is registered through {@code addSameWorks} under
 * the pivot's position in the input list and is then excluded from later comparisons as a
 * candidate.
 *
 * <p>Example: for the documents {@code AAA, BBb, bbb, AAa, aAA, ccc}, where {@code AAA} is a
 * duplicate of {@code AAa} and {@code aAA}, and {@code BBb} is a duplicate of {@code bbb}, the
 * result is something like {@code <0, <AAA, AAa, aAA>>}, {@code <1, <BBb, bbb>>}.
 *
 * @param documents documents to examine; the list itself is not modified
 * @return map from group key to the documents registered under that key
 */
public Map<Integer, Set<DocumentWrapper>> findDuplicates(List<DocumentWrapper> documents) {
  Map<Integer, Set<DocumentWrapper>> duplicateGroups = Maps.newHashMap();
  // Documents still available as comparison candidates.
  List<DocumentWrapper> unmatched = Lists.newArrayList(documents);

  int pivotIndex = 0;
  for (DocumentWrapper pivot : documents) {
    // Iterate over a snapshot, because matched documents are removed from 'unmatched' below.
    List<DocumentWrapper> candidates = new ArrayList<DocumentWrapper>(unmatched);
    for (DocumentWrapper candidate : candidates) {
      boolean sameDocument = pivot.getRowId().equals(candidate.getRowId());
      if (!sameDocument) {
        if (!duplicateWorkVoter.isDuplicate(pivot, candidate)) {
          continue;
        }
        // Presumably stores both documents in the set kept under pivotIndex — see addSameWorks.
        addSameWorks(duplicateGroups, pivotIndex, pivot, candidate);
      }
      // Reached for the pivot itself and for every confirmed duplicate.
      unmatched.remove(candidate);
    }
    // NOTE(review): a document already matched to an earlier pivot still acts as a pivot later,
    // so with a non-transitive voter it may end up under more than one key — confirm intended.
    pivotIndex++;
  }
  return duplicateGroups;
}
/**
 * Extracts {@code (doi, year, title)} from a serialized document.
 *
 * <p>Field 0 of {@code input} is read as a {@code DataByteArray} holding a serialized
 * {@code DocumentWrapper}. The title is the first one whose language starts with "en"
 * (case-insensitive), or the first title when there is no such entry. Diacritics are removed,
 * every character other than ASCII letters, digits, '-' and '_' becomes a space, and runs of
 * whitespace are collapsed. DOI and year have their whitespace collapsed and are trimmed.
 *
 * <p>When any step fails or yields an empty value, the matching counter in the
 * "extraction problems" group is incremented and {@code null} is returned.
 *
 * @param input tuple whose first field is the serialized document
 * @return a three-field tuple {@code (doi, year, title)}, or {@code null} when the input is
 *     {@code null} or empty or any of the three values could not be extracted
 * @throws IOException wrapping any unexpected exception raised while building the result
 */
@Override
public Tuple exec(Tuple input) throws IOException {
  myreporter = PigStatusReporter.getInstance();
  if (input == null || input.size() == 0) {
    return null;
  }
  try {
    DataByteArray dba;
    try {
      dba = (DataByteArray) input.get(0);
    } catch (Exception e) {
      reportExtractionProblem("DataByteArray from tuple");
      return null;
    }

    DocumentMetadata dm;
    try {
      dm = DocumentWrapper.parseFrom(dba.get()).getDocumentMetadata();
    } catch (Exception e) {
      reportExtractionProblem("document metadata");
      return null;
    }

    String title = extractNormalizedTitle(dm);
    if (title == null || title.trim().isEmpty()) {
      reportExtractionProblem("title extraction");
      return null;
    }

    String doi = null;
    try {
      doi = dm.getBasicMetadata().getDoi().replaceAll("\\s++", " ").trim();
    } catch (Exception e) {
      // Best effort: a failure here is reported through the counter below.
      logger.debug(StackTraceExtractor.getStackTrace(e));
    }
    if (doi == null || doi.isEmpty()) {
      reportExtractionProblem("doi extraction");
      return null;
    }

    String year = null;
    try {
      year = dm.getBasicMetadata().getYear().replaceAll("\\s++", " ").trim();
    } catch (Exception e) {
      // Best effort: a failure here is reported through the counter below.
      logger.debug(StackTraceExtractor.getStackTrace(e));
    }
    if (year == null || year.isEmpty()) {
      reportExtractionProblem("year extraction");
      return null;
    }

    Tuple t = TupleFactory.getInstance().newTuple();
    t.append(doi);
    t.append(year);
    t.append(title);
    return t;
  } catch (Exception e) {
    logger.debug(StackTraceExtractor.getStackTrace(e));
    throw new IOException(e);
  }
}

/**
 * Picks the preferred title and normalizes it; returns {@code null} when the document has no
 * title. If normalization itself fails, the value obtained so far is returned unchanged.
 */
private String extractNormalizedTitle(DocumentMetadata dm) {
  String title = null;
  try {
    for (TextWithLanguage twl : dm.getBasicMetadata().getTitleList()) {
      if (twl.getLanguage().toLowerCase().startsWith("en")) {
        title = twl.getText();
        break;
      }
    }
    // Fall back to the first title, but only if there is one.
    if (title == null && !dm.getBasicMetadata().getTitleList().isEmpty()) {
      title = dm.getBasicMetadata().getTitle(0).getText();
    }
    if (title != null && !title.trim().isEmpty()) {
      title = DiacriticsRemover.removeDiacritics(title);
      title = title.replaceAll("[^A-Za-z0-9\\-_]", " ").replaceAll("\\s++", " ").trim();
    }
  } catch (Exception e) {
    // Best effort: the caller treats a null or empty title as an extraction problem.
    logger.debug(StackTraceExtractor.getStackTrace(e));
  }
  return title;
}

/**
 * Increments the named counter in the "extraction problems" group. Does nothing when no counter
 * is available, so a missing reporter context cannot cause a NullPointerException.
 */
private void reportExtractionProblem(String counterName) {
  // Looked up twice so the counter's class does not have to be named (and imported) here.
  if (myreporter != null && myreporter.getCounter("extraction problems", counterName) != null) {
    myreporter.getCounter("extraction problems", counterName).increment(1);
  }
}