/**
 * Expands one serialized document into a bag holding one tuple per author.
 *
 * <p>Field 0 of {@code input} is read as a {@code DataByteArray} and parsed as a
 * {@code DocumentWrapper}. For the author at list position {@code i} a tuple
 * {@code (surname, serialized document metadata, i)} is added to the result. The same
 * {@code DataByteArray} instance holding the serialized metadata is shared by all tuples
 * of the returned bag.
 *
 * @param input tuple whose first field holds the serialized {@code DocumentWrapper}
 * @return the bag of per-author tuples (empty when the document lists no authors), or
 *     {@code null} when {@code input} is {@code null} or has no fields
 * @throws IOException if reading the field, parsing the bytes or building the bag fails;
 *     the message embeds the textual stack trace and the original exception is attached as
 *     the cause
 */
@Override
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }
    try {
        DataByteArray dba;
        try {
            dba = (DataByteArray) input.get(0);
        } catch (ExecException e) {
            logger.error("Error in reading field:", e);
            throw e;
        }

        DocumentWrapper dm;
        try {
            dm = DocumentWrapper.parseFrom(dba.get());
        } catch (Exception e) {
            logger.error("Error in reading ByteArray to DocumentMetadata:", e);
            throw e;
        }

        DataBag ret = new DefaultDataBag();
        // Serialize the metadata once; every author tuple references the same bytes.
        DataByteArray metadata = new DataByteArray(dm.getDocumentMetadata().toByteArray());
        List<Author> authors = dm.getDocumentMetadata().getBasicMetadata().getAuthorList();
        TupleFactory tupleFactory = TupleFactory.getInstance();
        for (int i = 0; i < authors.size(); i++) {
            String sname = authors.get(i).getSurname();
            // Tuple layout: (surname, serialized metadata, author position).
            ret.add(tupleFactory.newTuple(Arrays.<Object>asList(sname, metadata, i)));
        }
        return ret;
    } catch (Exception e) {
        logger.error("Error in processing input row:", e);
        // Keep the historical message text, but also chain the original exception so the
        // real failure stays reachable through getCause().
        throw new IOException(
                "Caught exception processing input row:\n" + StackTraceExtractor.getStackTrace(e),
                e);
    }
}
/**
 * Re-keys a serialized document by the key produced by {@code keyGen}.
 *
 * <p>The value bytes are parsed as a {@code DocumentWrapper} and a key is generated from its
 * document metadata (second argument {@code 0}). Documents whose generated key is empty
 * are dropped. For the rest, the result of
 * {@code DocumentWrapperUtils.cloneDocumentMetadata} is serialized and written under the
 * generated key.
 *
 * @param key input key; not used
 * @param value serialized {@code DocumentWrapper}
 * @param context mapper context receiving the {@code (key, serialized wrapper)} pair
 * @throws IOException if the bytes cannot be parsed or the write fails
 * @throws InterruptedException if the write is interrupted
 */
@Override
public void map(
        Writable key,
        BytesWritable value,
        Mapper<Writable, BytesWritable, Text, BytesWritable>.Context context)
        throws IOException, InterruptedException {
    final byte[] serialized = value.copyBytes();
    final DocumentWrapper parsed = DocumentProtos.DocumentWrapper.parseFrom(serialized);

    final String generatedKey = keyGen.generateKey(parsed.getDocumentMetadata(), 0);
    if (generatedKey.isEmpty()) {
        // Nothing to group on: skip this document.
        return;
    }

    // NOTE(review): presumably a lighter copy carrying only the metadata -- confirm against
    // DocumentWrapperUtils.
    final DocumentWrapper metadataCopy = DocumentWrapperUtils.cloneDocumentMetadata(parsed);
    final Text outKey = new Text(generatedKey);
    final BytesWritable outValue = new BytesWritable(metadataCopy.toByteArray());
    context.write(outKey, outValue);
}
/**
 * Passes through a configurable share of the incoming documents, keyed by row id.
 *
 * <p>The share is read from the job configuration entry {@code "percentOfWritten"}
 * (default {@code 100}). Records are counted with the field {@code i}, which is incremented
 * once per call. Within every window of 100 consecutive counter values the last
 * {@code percentOfWritten} records are written, so {@code 100} writes every record and
 * {@code 0} writes none (assuming {@code i} is non-negative).
 *
 * @param key input key; not used
 * @param value serialized {@code DocumentWrapper}
 * @param context mapper context; supplies the configuration and receives the output
 * @throws IOException if the bytes cannot be parsed or the write fails
 * @throws InterruptedException if the write is interrupted
 */
@Override
protected void map(Writable key, BytesWritable value, Context context)
        throws IOException, InterruptedException {
    int percentOfWritten = context.getConfiguration().getInt("percentOfWritten", 100);

    // Copy the backing bytes once; the same array is parsed and, if selected, re-emitted.
    byte[] payload = value.copyBytes();
    DocumentWrapper docWrapper = DocumentProtos.DocumentWrapper.parseFrom(payload);

    // A document without any title must not fail the task just because of a log line.
    String title =
            docWrapper.getDocumentMetadata().getBasicMetadata().getTitleCount() > 0
                    ? docWrapper.getDocumentMetadata().getBasicMetadata().getTitle(0).getText()
                    : "<no title>";
    log.info("work title = " + title);

    // i % 100 cycles through 0..99; exactly percentOfWritten of those values satisfy the
    // test. The previous form, (i % 101) > 100 - percentOfWritten, dropped every record
    // with i % 101 == 0 even at 100 percent.
    if ((i % 100) >= 100 - percentOfWritten) {
        log.info("writing...");
        context.write(new Text(docWrapper.getRowId()), new BytesWritable(payload));
    }
    i++;
}