Example 1
    @Override
    public boolean next(Text key, MapWritable value) throws IOException {
      if (result == null) {
        result = client.query(query);
      }

      boolean hasNext = result.hasNext();

      if (!hasNext) {
        return false;
      }

      Map<String, Object> next = result.next();
      // save the key as-is; with the old API there is no need to create a new Text() object
      currentKey = next.get("_id").toString();
      currentValue = (MapWritable) WritableUtils.toWritable(next.get("_source"));

      if (key != null) {
        key.set(currentKey);
      }
      if (value != null) {
        value.clear();
        value.putAll(currentValue);
      }

      // keep on counting
      read++;
      return true;
    }
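With the old org.apache.hadoop.mapred API the caller supplies reusable key/value holders, which is why next(...) above refills key and value in place. A minimal sketch of the consuming loop; the reader variable is a hypothetical instance obtained from InputFormat#getRecordReader, and imports from org.apache.hadoop.io and org.apache.hadoop.mapred are assumed:

    // Drain an old-API RecordReader; the holders are reused across calls.
    void drain(RecordReader<Text, MapWritable> reader) throws IOException {
      Text key = reader.createKey();            // reusable key holder
      MapWritable value = reader.createValue(); // reusable value holder
      while (reader.next(key, value)) {
        // key and value have been refilled in place for this record
        System.out.println(key + " -> " + value.size() + " fields");
      }
      reader.close();
    }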
Example 2
 public static Collection<Bucket> getBuckets(MapWritable key) {
   Collection<Bucket> bucketList = new ArrayList<Bucket>();
   MapWritable bucketMap = (MapWritable) key.get(MAPWRITABLE_BUCKET_KEY);
   for (Writable w : bucketMap.values()) {
     bucketList.add((Bucket) w);
   }
   return bucketList;
 }
Example 3
  public MapWritable getBucketAsKey(BucketStripped bucketStripped) throws IOException {
    MapWritable mw = new MapWritable();
    MapWritable bucketMap = new MapWritable();

    Bucket bucket = getBucket(bucketStripped);
    bucketMap.put(new IntWritable(bucket.hashCode()), bucket);
    mw.put(MAPWRITABLE_BUCKET_KEY, bucketMap);
    return mw;
  }
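The two bucket helpers above are symmetric: getBucketAsKey wraps a resolved Bucket into the nested MapWritable key structure that getBuckets later unpacks. A small round-trip sketch, assuming both methods live on the same class and a BucketStripped instance is at hand:

    // Hypothetical round trip through the MapWritable key format.
    MapWritable key = getBucketAsKey(bucketStripped); // wrap the resolved Bucket
    Collection<Bucket> buckets = getBuckets(key);     // recovers that same Bucket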
Example 4
  @Override
  public void remove() throws IOException {
    MapWritable msg = new MapWritable();
    msg.put(GraphJobRunner.FLAG_VERTEX_DECREASE, this.vertexID);

    // Get master task peer.
    String destPeer = GraphJobRunner.getMasterTask(this.getPeer());
    runner.getPeer().send(destPeer, new GraphJobMessage(msg));

    alterVertexCounter(-1);
  }
Example 5
  public MapWritable toMapWritableKey() {
    MapWritable mw = new MapWritable();
    MapWritable bucketMap = new MapWritable();

    for (IntWritable key : bucketCache.keySet()) {
      bucketMap.put(key, bucketCache.get(key));
    }
    mw.put(MAPWRITABLE_BUCKET_KEY, bucketMap);

    return mw;
  }
Example 6
  @Override
  public void addVertex(V vertexID, List<Edge<V, E>> edges, M value) throws IOException {
    MapWritable msg = new MapWritable();
    // Create the new vertex.
    Vertex<V, E, M> vertex = GraphJobRunner.<V, E, M>newVertexInstance(GraphJobRunner.VERTEX_CLASS);
    vertex.setEdges(edges);
    vertex.setValue(value);
    vertex.setVertexID(vertexID);

    msg.put(GraphJobRunner.FLAG_VERTEX_INCREASE, vertex);
    runner.getPeer().send(runner.getHostName(vertexID), new GraphJobMessage(msg));

    alterVertexCounter(1);
  }
Example 7
  @Override
  public Writable serialize(final Object obj, final ObjectInspector inspector)
      throws SerDeException {

    final StructObjectInspector structInspector = (StructObjectInspector) inspector;
    final List<? extends StructField> fields = structInspector.getAllStructFieldRefs();
    if (fields.size() != columnNames.size()) {
      throw new SerDeException(
          String.format("Required %d columns, received %d.", columnNames.size(), fields.size()));
    }

    cachedWritable.clear();
    for (int c = 0; c < fieldCount; c++) {
      StructField structField = fields.get(c);

      LOG.debug("fieldId=" + c + ",structField=" + structField.toString());

      if (structField != null) {
        final Object field = structInspector.getStructFieldData(obj, fields.get(c));

        final AbstractPrimitiveObjectInspector fieldOI =
            (AbstractPrimitiveObjectInspector) fields.get(c).getFieldObjectInspector();

        Writable value = (Writable) fieldOI.getPrimitiveWritableObject(field);

        if (value == null) {
          continue;
        }

        LOG.debug("fieldCount=" + fieldCount + ",value=" + value.toString());
        // every supported Writable type is stored unchanged, so a single
        // membership test replaces the per-type branches
        if (value instanceof IntWritable
            || value instanceof Text
            || value instanceof LongWritable
            || value instanceof DoubleWritable
            || value instanceof FloatWritable
            || value instanceof BooleanWritable
            || value instanceof ByteWritable
            || value instanceof BytesWritable) {
          cachedWritable.put(new Text(columnNames.get(c)), value);
        } else {
          LOG.warn("fieldCount=" + fieldCount + ",type=" + value.getClass().getName());
        }
      }
    }

    return cachedWritable;
  }
Example 8
 @Override
 public void write(Writable w) throws IOException {
   MapWritable map = (MapWritable) w;
   BasicDBObject dbo = new BasicDBObject();
   for (final Map.Entry<Writable, Writable> entry : map.entrySet()) {
     // System.err.println("Write: key=" + entry.getKey().toString()
     // + ", val=" + entry.getValue().toString());
     String key = entry.getKey().toString();
     // if ("id".equals(key)) {
     //   key = "_id";
     // }
     dbo.put(key, getObjectFromWritable(entry.getValue()));
   }
   table.save(dbo);
 }
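A quick usage sketch for the writer above; the recordWriter variable and the field names are illustrative, not from the original project:

    // Build a row as a MapWritable and hand it to the writer, which converts
    // each entry via getObjectFromWritable and saves the BasicDBObject.
    MapWritable row = new MapWritable();
    row.put(new Text("name"), new Text("alice"));
    row.put(new Text("age"), new IntWritable(42));
    recordWriter.write(row);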
Example 9
  @Override
  protected void map(Text key, DocumentMapping value, Context context)
      throws IOException, InterruptedException {
    // pull the distance and vector out of the mapping value
    double distance = value.getDistance();
    Vector vector = value.getVector();

    // map output
    MapWritable m = new MapWritable();
    m.put(ClusterEvaluatorMR.DOC_VECTOR_KEY, new VectorWritable(vector));
    m.put(ClusterEvaluatorMR.DISTANCE_KEY, new DoubleWritable(distance));
    m.put(ClusterEvaluatorMR.SQUARED_DISTANCE_KEY, new DoubleWritable(distance * distance));
    m.put(ClusterEvaluatorMR.COUNT_KEY, new LongWritable(1));

    context.write(key, m);
  }
Example 10
 private DocumentMetadata getAllMetadata(MapWritable map) {
   DocumentMetadata metadata = new DocumentMetadata();
   for (Writable w : map.keySet()) {
     String name = w.toString();
     if (!ParameterProcessing.NATIVE.equals(name)
         && !ParameterProcessing.NATIVE_AS_PDF.equals(name)
         && !name.startsWith(
             ParameterProcessing.NATIVE_AS_HTML)) { // all metadata except native, which is bytes
       Text value = (Text) map.get(new Text(name));
       metadata.set(name, value.toString());
     }
   }
   return metadata;
 }
Example 11
    public void reduce(Text key, Iterable<TermFrequencyWritable> values, Context context)
        throws IOException, InterruptedException {

      HashMap<Text, IntWritable> map = new HashMap<Text, IntWritable>();
      for (TermFrequencyWritable val : values) {
        Text docID = new Text(val.getDocumentID());
        int freq = val.getFreq().get();
        if (map.get(docID) != null) {
          map.put(docID, new IntWritable(map.get(docID).get() + freq));
        } else {
          map.put(docID, new IntWritable(freq));
        }
      }

      MapWritable outputMap = new MapWritable();
      outputMap.putAll(map);
      context.write(key, outputMap);
    }
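Downstream consumers recover the per-document frequencies by walking the emitted MapWritable; a minimal sketch with illustrative variable names:

      // outputMap as written above: Text docID -> IntWritable aggregated frequency
      for (Map.Entry<Writable, Writable> e : outputMap.entrySet()) {
        Text docID = (Text) e.getKey();
        int freq = ((IntWritable) e.getValue()).get();
        // ... use docID and freq ...
      }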
Example 12
 public MapWritable readMap(MapWritable mw) throws IOException {
   if (mw == null) {
     mw = new MapWritable();
   }
   int length = in.readMapHeader();
   for (int i = 0; i < length; i++) {
     Writable key = read();
     Writable value = read();
     mw.put(key, value);
   }
   return mw;
 }
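For comparison, stock Hadoop serialization round-trips a MapWritable without any custom map header: MapWritable implements Writable, so write(DataOutput) and readFields(DataInput) do the whole job. A self-contained sketch (imports from java.io and org.apache.hadoop.io assumed):

    // Round-trip a MapWritable through the standard Writable wire format.
    static MapWritable roundTrip(MapWritable original) throws IOException {
      ByteArrayOutputStream bytes = new ByteArrayOutputStream();
      original.write(new DataOutputStream(bytes));

      MapWritable copy = new MapWritable();
      copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
      return copy; // holds the same entries as the input
    }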
Example 13
 public SequenceFileExportMapper() {
   Fields.put(new Text("full_path"), FullPath);
   Fields.put(new Text("extension"), Ext);
   Fields.put(new Text("sha1"), Sha);
   Fields.put(new Text("md5"), Md5);
   Fields.put(new Text("data"), Vid);
   Fields.put(new Text("hdfs_path"), HdfsPath);
 }
Example 14
  @Test
  public void test_getWritable() throws Exception {
    assertTrue(NiFiOrcUtils.convertToORCObject(null, 1) instanceof IntWritable);
    assertTrue(NiFiOrcUtils.convertToORCObject(null, 1L) instanceof LongWritable);
    assertTrue(NiFiOrcUtils.convertToORCObject(null, 1.0f) instanceof FloatWritable);
    assertTrue(NiFiOrcUtils.convertToORCObject(null, 1.0) instanceof DoubleWritable);
    assertTrue(NiFiOrcUtils.convertToORCObject(null, new int[] {1, 2, 3}) instanceof List);
    assertTrue(NiFiOrcUtils.convertToORCObject(null, Arrays.asList(1, 2, 3)) instanceof List);
    Map<String, Float> map = new HashMap<>();
    map.put("Hello", 1.0f);
    map.put("World", 2.0f);

    Object writable =
        NiFiOrcUtils.convertToORCObject(
            TypeInfoUtils.getTypeInfoFromTypeString("map<string,float>"), map);
    assertTrue(writable instanceof MapWritable);
    MapWritable mapWritable = (MapWritable) writable;
    mapWritable.forEach(
        (key, value) -> {
          assertTrue(key instanceof Text);
          assertTrue(value instanceof FloatWritable);
        });
  }
Example 15
  private void processHtmlContent(MapWritable value, Metadata allMetadata, String uniqueId)
      throws IOException {
    BytesWritable htmlBytesWritable =
        (BytesWritable) value.get(new Text(ParameterProcessing.NATIVE_AS_HTML_NAME));
    if (htmlBytesWritable != null) {
      String htmlNativeEntryName =
          ParameterProcessing.HTML_FOLDER
              + "/"
              + uniqueId
              + "_"
              + new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName()
              + ".html";
      zipFileWriter.addBinaryFile(
          htmlNativeEntryName, htmlBytesWritable.getBytes(), htmlBytesWritable.getLength());
      logger.trace("Processing file: {}", htmlNativeEntryName);
    }

    // get the list with other files part of the html output
    Text htmlFiles = (Text) value.get(new Text(ParameterProcessing.NATIVE_AS_HTML));
    if (htmlFiles != null) {
      String fileNames = htmlFiles.toString();
      String[] fileNamesArr = fileNames.split(",");
      for (String fileName : fileNamesArr) {
        String entry = ParameterProcessing.HTML_FOLDER + "/" + fileName;

        BytesWritable imageBytesWritable =
            (BytesWritable)
                value.get(new Text(ParameterProcessing.NATIVE_AS_HTML + "_" + fileName));
        if (imageBytesWritable != null) {
          zipFileWriter.addBinaryFile(
              entry, imageBytesWritable.getBytes(), imageBytesWritable.getLength());
          logger.trace("Processing file: {}", entry);
        }
      }
    }
  }
Example 16
 @Override
 public boolean next(LongWritable keyHolder, MapWritable valueHolder) throws IOException {
   if (!cursor.hasNext()) {
     return false;
   }
   DBObject record = cursor.next();
   keyHolder.set(pos);
   for (int i = 0; i < this.readColumns.length; i++) {
     String key = readColumns[i];
     Object vObj = ("id".equals(key)) ? record.get("_id") : record.get(key);
     Writable value = (vObj == null) ? NullWritable.get() : new Text(vObj.toString());
     valueHolder.put(new Text(key), value);
   }
   pos++;
   return true;
 }
Example 17
  @Override
  public boolean next(LongWritable keyHolder, MapWritable valueHolder) throws IOException {
    SolrDocument doc = cursor.nextDocument();
    if (doc == null) {
      return false;
    }

    keyHolder.set(pos++);

    for (int i = 0; i < readColumns.length; i++) {
      String key = readColumns[i];
      Object vObj = doc.getFieldValue(key);
      Writable value = (vObj == null) ? NullWritable.get() : new Text(vObj.toString());
      valueHolder.put(new Text(key), value);
    }
    return true;
  }
Example 18
  public boolean next(LongWritable key, MapWritable value) throws IOException {
    try {
      if (!results.next()) {
        return false;
      }

      // Set the key field value as the output key value
      key.set(pos + split.getStart());

      ResultSetMetaData resultsMetaData = results.getMetaData();
      int columnCount = resultsMetaData.getColumnCount();

      List<String> names = new ArrayList<String>();
      List<Integer> types = new ArrayList<Integer>();
      // The column count starts from 1
      for (int i = 1; i <= columnCount; i++) {
        // This is the column name in db table
        String name = resultsMetaData.getColumnName(i).toLowerCase();
        // Get the relevant metaTable name
        name = databaseProperties.getInputColumnMappingFields().get(name);
        int type = resultsMetaData.getColumnType(i);
        // Hive keeps column names in lowercase
        names.add(name.toLowerCase());
        types.add(type);
      }

      for (int j = 0; j < types.size(); j++) {
        value.put(new Text(names.get(j)), getActualObjectTypeForValue(results, types, j));
      }

      pos++;
    } catch (SQLException e) {
      throw new IOException(e); // keep the SQLException as the cause
    }
    return true;
  }
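One possible refactor, a suggestion rather than anything from the original project: the column name and type lists are rebuilt on every call to next(), even though ResultSet metadata cannot change between rows, so the scan could run once up front.

    // Hypothetical helper: run the metadata scan once (e.g. from the constructor)
    // so next() only pays for per-row value conversion. Assumes instance fields
    // List<String> names and List<Integer> types replace the per-call locals.
    private void cacheColumnMetadata(ResultSet results) throws SQLException {
      ResultSetMetaData md = results.getMetaData();
      names = new ArrayList<String>();
      types = new ArrayList<Integer>();
      for (int i = 1; i <= md.getColumnCount(); i++) {
        String name = databaseProperties.getInputColumnMappingFields()
            .get(md.getColumnName(i).toLowerCase());
        names.add(name.toLowerCase()); // Hive keeps column names in lowercase
        types.add(md.getColumnType(i));
      }
    }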
Example 19
  protected void processMap(MapWritable value) throws IOException, InterruptedException {
    columnMetadata.reinit();
    ++outputFileCount;
    DocumentMetadata allMetadata = getAllMetadata(value);
    Metadata standardMetadata = getStandardMetadata(allMetadata, outputFileCount);
    columnMetadata.addMetadata(standardMetadata);
    columnMetadata.addMetadata(allMetadata);
    // documents other than the first one in this loop are either duplicates or attachments
    if (first) {
      masterOutputFileCount = outputFileCount;
    } else {
      if (allMetadata.hasParent()) {
        columnMetadata.addMetadataValue(
            DocumentMetadataKeys.ATTACHMENT_PARENT, UPIFormat.format(masterOutputFileCount));
      } else {
        columnMetadata.addMetadataValue(
            DocumentMetadataKeys.MASTER_DUPLICATE, UPIFormat.format(masterOutputFileCount));
      }
    }

    // String uniqueId = allMetadata.getUniqueId();

    String originalFileName =
        new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName();
    // add the text to the text folder
    String documentText = allMetadata.get(DocumentMetadataKeys.DOCUMENT_TEXT);
    String textEntryName =
        ParameterProcessing.TEXT
            + "/"
            + UPIFormat.format(outputFileCount)
            + "_"
            + originalFileName
            + ".txt";
    // textEntryName is built by concatenation and can never be null here
    zipFileWriter.addTextFile(textEntryName, documentText);
    columnMetadata.addMetadataValue(DocumentMetadataKeys.LINK_TEXT, textEntryName);
    // add the native file to the native folder
    String nativeEntryName =
        ParameterProcessing.NATIVE
            + "/"
            + UPIFormat.format(outputFileCount)
            + "_"
            + originalFileName;
    BytesWritable bytesWritable = (BytesWritable) value.get(new Text(ParameterProcessing.NATIVE));
    if (bytesWritable != null) { // some large exception files are not passed
      zipFileWriter.addBinaryFile(
          nativeEntryName, bytesWritable.getBytes(), bytesWritable.getLength());
      logger.trace("Processing file: {}", nativeEntryName);
    }
    columnMetadata.addMetadataValue(DocumentMetadataKeys.LINK_NATIVE, nativeEntryName);
    // add the pdf made from native to the PDF folder
    String pdfNativeEntryName =
        ParameterProcessing.PDF_FOLDER
            + "/"
            + UPIFormat.format(outputFileCount)
            + "_"
            + new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName()
            + ".pdf";
    BytesWritable pdfBytesWritable =
        (BytesWritable) value.get(new Text(ParameterProcessing.NATIVE_AS_PDF));
    if (pdfBytesWritable != null) {
      zipFileWriter.addBinaryFile(
          pdfNativeEntryName, pdfBytesWritable.getBytes(), pdfBytesWritable.getLength());
      logger.trace("Processing file: {}", pdfNativeEntryName);
    }

    processHtmlContent(value, allMetadata, UPIFormat.format(outputFileCount));

    // add exception to the exception folder
    String exception = allMetadata.get(DocumentMetadataKeys.PROCESSING_EXCEPTION);
    if (exception != null) {
      String exceptionEntryName =
          "exception/"
              + UPIFormat.format(outputFileCount)
              + "_"
              + new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName();
      if (bytesWritable != null) {
        zipFileWriter.addBinaryFile(
            exceptionEntryName, bytesWritable.getBytes(), bytesWritable.getLength());
      }
      columnMetadata.addMetadataValue(DocumentMetadataKeys.LINK_EXCEPTION, exceptionEntryName);
    }
    // write this all to the reduce map
    // context.write(new Text(outputKey), new Text(columnMetadata.delimiterSeparatedValues()));
    // drop the key altogether, because it messes up the format - but put it in the value
    // TODO use NullWritable
    if (OsUtil.isNix()) {
      context.write(null, new Text(columnMetadata.delimiterSeparatedValues()));
    }
    // prepare for the next file with the same key, if there is any
    first = false;
  }
Example 20
  // Process an input document with GATE and a Reporter
  public synchronized BehemothDocument[] process(BehemothDocument inputDoc, Reporter reporter) {
    if (reporter != null) reporter.setStatus("GATE : " + inputDoc.getUrl().toString());

    boolean clearBehemothAnnotations = config.getBoolean("gate.deleteBehemothAnnotations", false);

    // process the text passed as value with the application
    // a) create a GATE document based on the text value
    gate.Document gatedocument = null;
    try {

      gatedocument = generateGATEDoc(inputDoc);
      // add it to the current corpus
      corpus.add(gatedocument);
      // get the application and assign the corpus to it
      this.GATEapplication.setCorpus(corpus);
      // process it with GATE
      this.GATEapplication.execute();

      AnnotationSet annots = null;
      if ("".equals(filters.getAnnotationSetName())) annots = gatedocument.getAnnotations();
      else annots = gatedocument.getAnnotations(filters.getAnnotationSetName());

      // enrich the input doc with the annotations from
      // the GATE application
      // transfer the annotations from the GATE document
      // to the Behemoth one using the filters
      List<com.digitalpebble.behemoth.Annotation> beheannotations =
          convertGATEAnnotationsToBehemoth(annots, inputDoc);

      // sort the annotations before adding them?
      Collections.sort(beheannotations);

      // clear the existing behemoth annotations
      if (clearBehemothAnnotations) {
        inputDoc.getAnnotations().clear();
      }

      inputDoc.getAnnotations().addAll(beheannotations);

      // add counters about num of annotations added
      if (reporter != null)
        for (com.digitalpebble.behemoth.Annotation annot : beheannotations) {
          reporter.incrCounter("GATE", annot.getType(), 1);
        }

      // Add the document features from GATE to Behemoth
      Set<String> docFeatFilter = this.filters.getDocFeaturesFilter();
      MapWritable beheMD = inputDoc.getMetadata(true);
      if (docFeatFilter.size() > 0) {
        for (String docFeatName : docFeatFilter) {
          Object featValue = gatedocument.getFeatures().get(docFeatName);
          if (featValue != null) {
            beheMD.put(new Text(docFeatName), new Text(featValue.toString()));
          }
        }
      }

      if (reporter != null) reporter.incrCounter("GATE", "Document", 1);

    } catch (Exception e) {
      LOG.error(inputDoc.getUrl().toString(), e);
      if (reporter != null) reporter.incrCounter("GATE", "Exceptions", 1);
    } finally {
      // remove the document from the corpus again
      corpus.clear();
      // and from memory
      if (gatedocument != null) Factory.deleteResource(gatedocument);
    }
    // currently returns only the input document
    return new BehemothDocument[] {inputDoc};
  }