@Override
public boolean next(Text key, MapWritable value) throws IOException {
    if (result == null) {
        result = client.query(query);
    }
    boolean hasNext = result.hasNext();
    if (!hasNext) {
        return false;
    }
    Map<String, Object> next = result.next();
    // we save the key as is since, under the old API, we don't have to create a new Text() object
    currentKey = next.get("_id").toString();
    currentValue = (MapWritable) WritableUtils.toWritable(next.get("_source"));
    if (key != null) {
        key.set(currentKey);
    }
    if (value != null) {
        value.clear();
        value.putAll(currentValue);
    }
    // keep on counting
    read++;
    return true;
}
public static Collection<Bucket> getBuckets(MapWritable key) {
    Collection<Bucket> bucketList = new ArrayList<Bucket>();
    MapWritable bucketMap = (MapWritable) key.get(MAPWRITABLE_BUCKET_KEY);
    for (Writable w : bucketMap.values()) {
        bucketList.add((Bucket) w);
    }
    return bucketList;
}
public MapWritable getBucketAsKey(BucketStripped bucketStripped) throws IOException {
    MapWritable mw = new MapWritable();
    MapWritable bucketMap = new MapWritable();
    Bucket bucket = getBucket(bucketStripped);
    bucketMap.put(new IntWritable(bucket.hashCode()), bucket);
    mw.put(MAPWRITABLE_BUCKET_KEY, bucketMap);
    return mw;
}
@Override
public void remove() throws IOException {
    MapWritable msg = new MapWritable();
    msg.put(GraphJobRunner.FLAG_VERTEX_DECREASE, this.vertexID);
    // Get master task peer.
    String destPeer = GraphJobRunner.getMasterTask(this.getPeer());
    runner.getPeer().send(destPeer, new GraphJobMessage(msg));
    alterVertexCounter(-1);
}
public MapWritable toMapWritableKey() {
    MapWritable mw = new MapWritable();
    MapWritable bucketMap = new MapWritable();
    for (IntWritable key : bucketCache.keySet()) {
        bucketMap.put(key, bucketCache.get(key));
    }
    mw.put(MAPWRITABLE_BUCKET_KEY, bucketMap);
    return mw;
}
@Override
public void addVertex(V vertexID, List<Edge<V, E>> edges, M value) throws IOException {
    MapWritable msg = new MapWritable();
    // Create the new vertex.
    Vertex<V, E, M> vertex = GraphJobRunner.<V, E, M>newVertexInstance(GraphJobRunner.VERTEX_CLASS);
    vertex.setEdges(edges);
    vertex.setValue(value);
    vertex.setVertexID(vertexID);
    msg.put(GraphJobRunner.FLAG_VERTEX_INCREASE, vertex);
    runner.getPeer().send(runner.getHostName(vertexID), new GraphJobMessage(msg));
    alterVertexCounter(1);
}
@Override
public Writable serialize(final Object obj, final ObjectInspector inspector) throws SerDeException {
    final StructObjectInspector structInspector = (StructObjectInspector) inspector;
    final List<? extends StructField> fields = structInspector.getAllStructFieldRefs();
    if (fields.size() != columnNames.size()) {
        throw new SerDeException(
            String.format("Required %d columns, received %d.", columnNames.size(), fields.size()));
    }
    cachedWritable.clear();
    for (int c = 0; c < fieldCount; c++) {
        StructField structField = fields.get(c);
        if (structField != null) {
            LOG.debug("fieldId=" + c + ",structField=" + structField.toString());
            final Object field = structInspector.getStructFieldData(obj, fields.get(c));
            final AbstractPrimitiveObjectInspector fieldOI =
                (AbstractPrimitiveObjectInspector) fields.get(c).getFieldObjectInspector();
            Writable value = (Writable) fieldOI.getPrimitiveWritableObject(field);
            if (value == null) {
                continue;
            }
            LOG.debug("fieldCount=" + fieldCount + ",value=" + value.toString());
            if (value instanceof IntWritable) {
                cachedWritable.put(new Text(columnNames.get(c)), value);
            } else if (value instanceof Text) {
                cachedWritable.put(new Text(columnNames.get(c)), (Text) value);
            } else if (value instanceof LongWritable) {
                cachedWritable.put(new Text(columnNames.get(c)), (LongWritable) value);
            } else if (value instanceof DoubleWritable) {
                cachedWritable.put(new Text(columnNames.get(c)), (DoubleWritable) value);
            } else if (value instanceof FloatWritable) {
                cachedWritable.put(new Text(columnNames.get(c)), (FloatWritable) value);
            } else if (value instanceof BooleanWritable) {
                cachedWritable.put(new Text(columnNames.get(c)), (BooleanWritable) value);
            } else if (value instanceof ByteWritable) {
                cachedWritable.put(new Text(columnNames.get(c)), (ByteWritable) value);
            } else if (value instanceof BytesWritable) {
                cachedWritable.put(new Text(columnNames.get(c)), (BytesWritable) value);
            } else {
                LOG.warn("fieldCount=" + fieldCount + ",type=" + value.getClass().getName());
            }
        }
    }
    return cachedWritable;
}
@Override
public void write(Writable w) throws IOException {
    MapWritable map = (MapWritable) w;
    BasicDBObject dbo = new BasicDBObject();
    for (final Map.Entry<Writable, Writable> entry : map.entrySet()) {
        // System.err.println("Write: key=" + entry.getKey().toString()
        //     + ", val=" + entry.getValue().toString());
        String key = entry.getKey().toString();
        // if ("id".equals(key)) {
        //     key = "_id";
        // }
        dbo.put(key, getObjectFromWritable(entry.getValue()));
    }
    table.save(dbo);
}
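The getObjectFromWritable helper called above is not shown in this snippet; the sketch below is a hypothetical version, assuming it only needs to unwrap the common primitive Hadoop writables into plain Java values that BasicDBObject can store.

// Hypothetical sketch of getObjectFromWritable (not the original implementation):
// unwrap common Hadoop writables into plain Java objects before handing them to the Mongo driver.
private static Object getObjectFromWritable(Writable w) {
    if (w instanceof NullWritable) {
        return null; // missing field
    } else if (w instanceof IntWritable) {
        return ((IntWritable) w).get();
    } else if (w instanceof LongWritable) {
        return ((LongWritable) w).get();
    } else if (w instanceof DoubleWritable) {
        return ((DoubleWritable) w).get();
    } else if (w instanceof BooleanWritable) {
        return ((BooleanWritable) w).get();
    } else if (w instanceof BytesWritable) {
        BytesWritable bytes = (BytesWritable) w;
        // copy only the valid portion of the padded backing array
        return Arrays.copyOf(bytes.getBytes(), bytes.getLength());
    }
    // Text and anything else falls back to its string form
    return w.toString();
}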
@Override
protected void map(Text key, DocumentMapping value, Context context)
        throws IOException, InterruptedException {
    // no need vector for this MR
    double distance = value.getDistance();
    Vector vector = value.getVector();

    // map output
    MapWritable m = new MapWritable();
    m.put(ClusterEvaluatorMR.DOC_VECTOR_KEY, new VectorWritable(vector));
    m.put(ClusterEvaluatorMR.DISTANCE_KEY, new DoubleWritable(distance));
    m.put(ClusterEvaluatorMR.SQUARED_DISTANCE_KEY, new DoubleWritable(distance * distance));
    m.put(ClusterEvaluatorMR.COUNT_KEY, new LongWritable(1));
    context.write(key, m);
}
private DocumentMetadata getAllMetadata(MapWritable map) {
    DocumentMetadata metadata = new DocumentMetadata();
    Set<Writable> set = map.keySet();
    Iterator<Writable> iter = set.iterator();
    while (iter.hasNext()) {
        String name = iter.next().toString();
        if (!ParameterProcessing.NATIVE.equals(name)
                && !ParameterProcessing.NATIVE_AS_PDF.equals(name)
                && !name.startsWith(ParameterProcessing.NATIVE_AS_HTML)) {
            // all metadata but native - which is bytes!
            Text value = (Text) map.get(new Text(name));
            metadata.set(name, value.toString());
        }
    }
    return metadata;
}
public void reduce(Text key, Iterable<TermFrequencyWritable> values, Context context)
        throws IOException, InterruptedException {
    HashMap<Text, IntWritable> map = new HashMap<Text, IntWritable>();
    for (TermFrequencyWritable val : values) {
        Text docID = new Text(val.getDocumentID());
        int freq = val.getFreq().get();
        if (map.get(docID) != null) {
            map.put(docID, new IntWritable(map.get(docID).get() + freq));
        } else {
            map.put(docID, new IntWritable(freq));
        }
    }
    MapWritable outputMap = new MapWritable();
    outputMap.putAll(map);
    context.write(key, outputMap);
}
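For context, a minimal sketch of how a downstream consumer might walk the per-term postings map emitted above; the helper name is made up here, but the Text/IntWritable entry types are the ones this reducer writes.

// Illustrative helper (not part of the original job): prints each docID/frequency
// pair stored in the MapWritable that the reducer above emits for a term.
private static void dumpPostings(Text term, MapWritable postings) {
    for (Map.Entry<Writable, Writable> entry : postings.entrySet()) {
        Text docID = (Text) entry.getKey();
        IntWritable freq = (IntWritable) entry.getValue();
        System.out.println(term + "\t" + docID + "\t" + freq.get());
    }
}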
public MapWritable readMap(MapWritable mw) throws IOException {
    if (mw == null) {
        mw = new MapWritable();
    }
    int length = in.readMapHeader();
    for (int i = 0; i < length; i++) {
        Writable key = read();
        Writable value = read();
        mw.put(key, value);
    }
    return mw;
}
public SequenceFileExportMapper() {
    Fields.put(new Text("full_path"), FullPath);
    Fields.put(new Text("extension"), Ext);
    Fields.put(new Text("sha1"), Sha);
    Fields.put(new Text("md5"), Md5);
    Fields.put(new Text("data"), Vid);
    Fields.put(new Text("hdfs_path"), HdfsPath);
}
@Test
public void test_getWritable() throws Exception {
    assertTrue(NiFiOrcUtils.convertToORCObject(null, 1) instanceof IntWritable);
    assertTrue(NiFiOrcUtils.convertToORCObject(null, 1L) instanceof LongWritable);
    assertTrue(NiFiOrcUtils.convertToORCObject(null, 1.0f) instanceof FloatWritable);
    assertTrue(NiFiOrcUtils.convertToORCObject(null, 1.0) instanceof DoubleWritable);
    assertTrue(NiFiOrcUtils.convertToORCObject(null, new int[] {1, 2, 3}) instanceof List);
    assertTrue(NiFiOrcUtils.convertToORCObject(null, Arrays.asList(1, 2, 3)) instanceof List);

    Map<String, Float> map = new HashMap<>();
    map.put("Hello", 1.0f);
    map.put("World", 2.0f);
    Object writable = NiFiOrcUtils.convertToORCObject(
        TypeInfoUtils.getTypeInfoFromTypeString("map<string,float>"), map);
    assertTrue(writable instanceof MapWritable);

    MapWritable mapWritable = (MapWritable) writable;
    mapWritable.forEach((key, value) -> {
        assertTrue(key instanceof Text);
        assertTrue(value instanceof FloatWritable);
    });
}
private void processHtmlContent(MapWritable value, Metadata allMetadata, String uniqueId)
        throws IOException {
    BytesWritable htmlBytesWritable =
        (BytesWritable) value.get(new Text(ParameterProcessing.NATIVE_AS_HTML_NAME));
    if (htmlBytesWritable != null) {
        String htmlNativeEntryName = ParameterProcessing.HTML_FOLDER
            + "/" + uniqueId + "_"
            + new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName()
            + ".html";
        zipFileWriter.addBinaryFile(
            htmlNativeEntryName, htmlBytesWritable.getBytes(), htmlBytesWritable.getLength());
        logger.trace("Processing file: {}", htmlNativeEntryName);
    }

    // get the list of other files that are part of the html output
    Text htmlFiles = (Text) value.get(new Text(ParameterProcessing.NATIVE_AS_HTML));
    if (htmlFiles != null) {
        String fileNames = htmlFiles.toString();
        String[] fileNamesArr = fileNames.split(",");
        for (String fileName : fileNamesArr) {
            String entry = ParameterProcessing.HTML_FOLDER + "/" + fileName;
            BytesWritable imageBytesWritable = (BytesWritable) value.get(
                new Text(ParameterProcessing.NATIVE_AS_HTML + "_" + fileName));
            if (imageBytesWritable != null) {
                zipFileWriter.addBinaryFile(
                    entry, imageBytesWritable.getBytes(), imageBytesWritable.getLength());
                logger.trace("Processing file: {}", entry);
            }
        }
    }
}
@Override
public boolean next(LongWritable keyHolder, MapWritable valueHolder) throws IOException {
    if (!cursor.hasNext()) {
        return false;
    }
    DBObject record = cursor.next();
    keyHolder.set(pos);
    for (int i = 0; i < this.readColumns.length; i++) {
        String key = readColumns[i];
        Object vObj = ("id".equals(key)) ? record.get("_id") : record.get(key);
        Writable value = (vObj == null) ? NullWritable.get() : new Text(vObj.toString());
        valueHolder.put(new Text(key), value);
    }
    pos++;
    return true;
}
@Override
public boolean next(LongWritable keyHolder, MapWritable valueHolder) throws IOException {
    SolrDocument doc = cursor.nextDocument();
    if (doc == null) {
        return false;
    }
    keyHolder.set(pos++);
    for (int i = 0; i < readColumns.length; i++) {
        String key = readColumns[i];
        Object vObj = doc.getFieldValue(key);
        Writable value = (vObj == null) ? NullWritable.get() : new Text(vObj.toString());
        valueHolder.put(new Text(key), value);
    }
    return true;
}
public boolean next(LongWritable key, MapWritable value) throws IOException {
    try {
        if (!results.next()) {
            return false;
        }
        // Set the key field value as the output key value
        key.set(pos + split.getStart());

        ResultSetMetaData resultsMetaData = results.getMetaData();
        int columnCount = resultsMetaData.getColumnCount();

        List<String> names = new ArrayList<String>();
        List<Integer> types = new ArrayList<Integer>();
        // JDBC column indexes start from 1
        for (int i = 1; i <= columnCount; i++) {
            // This is the column name in the db table
            String name = resultsMetaData.getColumnName(i).toLowerCase();
            // Get the relevant metaTable name
            name = databaseProperties.getInputColumnMappingFields().get(name);
            int type = resultsMetaData.getColumnType(i);
            // Hive keeps column names in lowercase
            names.add(name.toLowerCase());
            types.add(type);
        }
        for (int j = 0; j < types.size(); j++) {
            value.put(new Text(names.get(j)), getActualObjectTypeForValue(results, types, j));
        }
        pos++;
    } catch (SQLException e) {
        throw new IOException(e.getMessage());
    }
    return true;
}
protected void processMap(MapWritable value) throws IOException, InterruptedException {
    columnMetadata.reinit();
    ++outputFileCount;
    DocumentMetadata allMetadata = getAllMetadata(value);
    Metadata standardMetadata = getStandardMetadata(allMetadata, outputFileCount);
    columnMetadata.addMetadata(standardMetadata);
    columnMetadata.addMetadata(allMetadata);

    // documents other than the first one in this loop are either duplicates or attachments
    if (first) {
        masterOutputFileCount = outputFileCount;
    } else {
        if (allMetadata.hasParent()) {
            columnMetadata.addMetadataValue(
                DocumentMetadataKeys.ATTACHMENT_PARENT, UPIFormat.format(masterOutputFileCount));
        } else {
            columnMetadata.addMetadataValue(
                DocumentMetadataKeys.MASTER_DUPLICATE, UPIFormat.format(masterOutputFileCount));
        }
    }

    // String uniqueId = allMetadata.getUniqueId();
    String originalFileName =
        new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName();

    // add the text to the text folder
    String documentText = allMetadata.get(DocumentMetadataKeys.DOCUMENT_TEXT);
    String textEntryName = ParameterProcessing.TEXT + "/"
        + UPIFormat.format(outputFileCount) + "_" + originalFileName + ".txt";
    if (textEntryName != null) {
        zipFileWriter.addTextFile(textEntryName, documentText);
    }
    columnMetadata.addMetadataValue(DocumentMetadataKeys.LINK_TEXT, textEntryName);

    // add the native file to the native folder
    String nativeEntryName = ParameterProcessing.NATIVE + "/"
        + UPIFormat.format(outputFileCount) + "_" + originalFileName;
    BytesWritable bytesWritable = (BytesWritable) value.get(new Text(ParameterProcessing.NATIVE));
    if (bytesWritable != null) { // some large exception files are not passed
        zipFileWriter.addBinaryFile(
            nativeEntryName, bytesWritable.getBytes(), bytesWritable.getLength());
        logger.trace("Processing file: {}", nativeEntryName);
    }
    columnMetadata.addMetadataValue(DocumentMetadataKeys.LINK_NATIVE, nativeEntryName);

    // add the pdf made from native to the PDF folder
    String pdfNativeEntryName = ParameterProcessing.PDF_FOLDER + "/"
        + UPIFormat.format(outputFileCount) + "_"
        + new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName()
        + ".pdf";
    BytesWritable pdfBytesWritable =
        (BytesWritable) value.get(new Text(ParameterProcessing.NATIVE_AS_PDF));
    if (pdfBytesWritable != null) {
        zipFileWriter.addBinaryFile(
            pdfNativeEntryName, pdfBytesWritable.getBytes(), pdfBytesWritable.getLength());
        logger.trace("Processing file: {}", pdfNativeEntryName);
    }

    processHtmlContent(value, allMetadata, UPIFormat.format(outputFileCount));

    // add exception to the exception folder
    String exception = allMetadata.get(DocumentMetadataKeys.PROCESSING_EXCEPTION);
    if (exception != null) {
        String exceptionEntryName = "exception/"
            + UPIFormat.format(outputFileCount) + "_"
            + new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName();
        if (bytesWritable != null) {
            zipFileWriter.addBinaryFile(
                exceptionEntryName, bytesWritable.getBytes(), bytesWritable.getLength());
        }
        columnMetadata.addMetadataValue(DocumentMetadataKeys.LINK_EXCEPTION, exceptionEntryName);
    }

    // write this all to the reduce map
    // context.write(new Text(outputKey), new Text(columnMetadata.delimiterSeparatedValues()));
    // drop the key altogether, because it messes up the format - but put it in the value
    // TODO use NullWritable
    if (OsUtil.isNix()) {
        context.write(null, new Text(columnMetadata.delimiterSeparatedValues()));
    }
    // prepare for the next file with the same key, if there is any
    first = false;
}
// Process an input document with GATE and a Reporter
public synchronized BehemothDocument[] process(BehemothDocument inputDoc, Reporter reporter) {
    if (reporter != null) {
        reporter.setStatus("GATE : " + inputDoc.getUrl().toString());
    }
    boolean clearBehemothAnnotations = config.getBoolean("gate.deleteBehemothAnnotations", false);

    // process the text passed as value with the application
    // a) create a GATE document based on the text value
    gate.Document gatedocument = null;
    try {
        gatedocument = generateGATEDoc(inputDoc);
        // add it to the current corpus
        corpus.add(gatedocument);
        // get the application and assign the corpus to it
        this.GATEapplication.setCorpus(corpus);
        // process it with GATE
        this.GATEapplication.execute();

        AnnotationSet annots = null;
        if ("".equals(filters.getAnnotationSetName())) {
            annots = gatedocument.getAnnotations();
        } else {
            annots = gatedocument.getAnnotations(filters.getAnnotationSetName());
        }

        // enrich the input doc with the annotations from the GATE application:
        // transfer the annotations from the GATE document to the Behemoth one using the filters
        List<com.digitalpebble.behemoth.Annotation> beheannotations =
            convertGATEAnnotationsToBehemoth(annots, inputDoc);

        // sort the annotations before adding them?
        Collections.sort(beheannotations);

        // clear the existing behemoth annotations
        if (clearBehemothAnnotations) {
            inputDoc.getAnnotations().clear();
        }
        inputDoc.getAnnotations().addAll(beheannotations);

        // add counters about num of annotations added
        if (reporter != null) {
            for (com.digitalpebble.behemoth.Annotation annot : beheannotations) {
                reporter.incrCounter("GATE", annot.getType(), 1);
            }
        }

        // Add the document features from GATE to Behemoth
        Set<String> docFeatFilter = this.filters.getDocFeaturesFilter();
        MapWritable beheMD = inputDoc.getMetadata(true);
        if (docFeatFilter.size() > 0) {
            for (String docFeatName : docFeatFilter) {
                Object featValue = gatedocument.getFeatures().get(docFeatName);
                if (featValue != null) {
                    beheMD.put(new Text(docFeatName), new Text(featValue.toString()));
                }
            }
        }

        if (reporter != null) {
            reporter.incrCounter("GATE", "Document", 1);
        }
    } catch (Exception e) {
        LOG.error(inputDoc.getUrl().toString(), e);
        if (reporter != null) {
            reporter.incrCounter("GATE", "Exceptions", 1);
        }
    } finally {
        // remove the document from the corpus again
        corpus.clear();
        // and from memory
        if (gatedocument != null) {
            Factory.deleteResource(gatedocument);
        }
    }
    // currently returns only the input document
    return new BehemothDocument[] {inputDoc};
}