Example #1
    @Override
    public void reduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
      // For each value, figure out which file it's from and store it
      // accordingly.
      List<String> first = new ArrayList<String>();
      List<String> second = new ArrayList<String>();

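      // Each value carries a one-character tag: '1' marks the first input file,
      // anything else the second; the tag is stripped before the value is stored.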
      for (Text value : values) {
        if (value.charAt(0) == '1') {
          first.add(value.toString().substring(1));
        } else second.add(value.toString().substring(1));
        context.setStatus("OK");
      }

      context.setStatus("OK");

      if (first.size() == 0) return;
      if (second.size() == 0) second.add(null);

      // Do the cross product
      for (String s1 : first) {
        for (String s2 : second) {
          if (s2 == null) OUT.set(key.toString() + "\t" + s1 + "\t\t");
          else OUT.set(key.toString() + "\t" + s1 + "\t" + key.toString() + "\t" + s2);
          context.write(NULL, OUT);
        }
      }
    }
Example #2
    @Override
    public void map(LongWritable row, NullWritable ignored, Context context)
        throws IOException, InterruptedException {
      context.setStatus("Entering");
      long rowId = row.get();
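      // Lazily create the generator; the seed advances with the row id because
      // three random values are consumed per row.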
      if (rand == null) {
        // we use 3 random numbers per row
        rand = new RandomGenerator(rowId * 3);
      }
      addKey();
      value.clear();
      // addRowId(rowId);
      addFiller(rowId);

      // New
      Mutation m = new Mutation(key);
      m.put(
          new Text("c"), // column family
          getRowIdString(rowId), // column qual
          new Value(value.toString().getBytes())); // data

      context.setStatus("About to add to accumulo");
      context.write(tableName, m);
      context.setStatus("Added to accumulo " + key.toString());
    }
Example #3
  protected void baseQualityScoreRecalibration(
      Context context,
      String region,
      ChromosomeRange r,
      PreprocessingTools tools,
      GATKTools gatk,
      String input,
      String output)
      throws InterruptedException, IOException, URISyntaxException {
    String table = tmpFileBase + ".table";

    // get snp database(s)
    String[] snpslocal = HalvadeFileUtils.downloadSites(context, taskId);
    String[] newKnownSites = new String[snpslocal.length];
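    // Optionally filter each known-sites file down to the current chromosome range;
    // gzipped files are unzipped before being handed to GATK.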
    for (int i = 0; i < snpslocal.length; i++) {
      if (filterDBsnp) {
        newKnownSites[i] =
            tools.filterDBSnps(
                ref.replaceAll("fasta", "dict"), snpslocal[i], r, tmpFileBase, threads);
      } else {
        newKnownSites[i] = snpslocal[i];
      }
      if (newKnownSites[i].endsWith(".gz")) {
        newKnownSites[i] = HalvadeFileUtils.Unzip(newKnownSites[i]);
      }
    }

    // should be created automatically by GATK v3.0 or higher
    //        Logger.DEBUG("build bam index");
    //        context.setStatus("build bam index");
    //        tools.runBuildBamIndex(tmpFile1);
    Logger.DEBUG("run baseRecalibrator");
    context.setStatus("run baseRecalibrator");
    context.getCounter(HalvadeCounters.TOOLS_GATK).increment(1);
    gatk.runBaseRecalibrator(input, table, ref, newKnownSites, region);

    Logger.DEBUG("run printReads");
    context.setStatus("run printReads");
    context.getCounter(HalvadeCounters.TOOLS_GATK).increment(1);
    gatk.runPrintReads(input, output, ref, table, region);

    HalvadeFileUtils.removeLocalFile(keep, input, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, input.replaceAll(".bam", ".bai"));
    HalvadeFileUtils.removeLocalFile(keep, table, context, HalvadeCounters.FOUT_GATK_TMP);
    for (int i = 0; i < newKnownSites.length; i++) {
      if (filterDBsnp) {
        HalvadeFileUtils.removeLocalFile(
            keep, newKnownSites[i], context, HalvadeCounters.FOUT_GATK_TMP);
      }
    }
  }
Example #4
  protected void elPrepPreprocess(
      Context context, PreprocessingTools tools, SAMRecordIterator input, String output)
      throws InterruptedException, IOException, QualityException, URISyntaxException {
    String dictF = ref.substring(0, ref.lastIndexOf('.')) + ".dict";
    String rg = createReadGroupRecordString(RGID, RGLB, RGPL, RGPU, RGSM);
    String preSamOut = tmpFileBase + "-p1.sam";
    String samOut = tmpFileBase + "-p2.sam";
    String fCounts = tmpFileBase + "-features.count";

    outHeader = header.clone();
    outHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);

    Logger.DEBUG("call elPrep");
    context.setStatus("call elPrep");
    int reads;
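    // 'keep' selects the file-based elPrep call (the intermediate SAM stays on disk)
    // over the streaming variant that pipes records straight through elPrep.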
    if (keep) {
      reads =
          tools.callElPrep(
              preSamOut, samOut, inputIsBam ? null : rg, threads, input, outHeader, dictF);
    } else {
      reads =
          tools.streamElPrep(
              context, samOut, inputIsBam ? null : rg, threads, input, outHeader, dictF);
    }

    Logger.DEBUG(reads + " reads processed in elPrep");
    context.getCounter(HalvadeCounters.IN_PREP_READS).increment(reads);

    if (gff != null) {
      Logger.DEBUG("featureCounts");
      context.setStatus("featureCounts");
      tools.runFeatureCounts(gff, samOut, fCounts, threads);
      HalvadeFileUtils.uploadFileToHDFS(
          context,
          FileSystem.get(new URI(outputdir), context.getConfiguration()),
          fCounts,
          outputdir + context.getTaskAttemptID().toString() + ".count");
    }
    context.setStatus("convert SAM to BAM");
    Logger.DEBUG("convert SAM to BAM");
    tools.callSAMToBAM(samOut, output, threads);
    context.setStatus("build bam index");
    Logger.DEBUG("build bam index");
    tools.runBuildBamIndex(output);
    // remove temporary files
    HalvadeFileUtils.removeLocalFile(keep, preSamOut, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, samOut, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, fCounts);
  }
Example #5
  protected void RnaVariantCalling(
      Context context, String region, GATKTools gatk, String input, String output)
      throws InterruptedException {
    // choose between UnifiedGenotyper and HaplotypeCaller
    Logger.DEBUG("run variantCaller");
    context.setStatus("run variantCaller");
    context.getCounter(HalvadeCounters.TOOLS_GATK).increment(1);
    gatk.runHaplotypeCaller(input, output, true, scc, sec, ref, null, region);

    context.setStatus("cleanup");
    context.getCounter(HalvadeCounters.OUT_VCF_FILES).increment(1);

    HalvadeFileUtils.removeLocalFile(keep, input, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, input.replaceAll(".bam", ".bai"));
  }
Example #6
    @Override
    public void map(LongWritable key, Text line, Context context) throws IOException {

      StringTokenizer stringTokenizer = new StringTokenizer(line.toString(), "\t");
      byte[] row = Bytes.toBytes(stringTokenizer.nextToken());
      System.out.println(Bytes.toString(row)); // log the row key, not the byte[] reference
      Put put = new Put(row);

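      // The remaining tab-separated tokens are written, in a fixed order, as columns
      // of the raw-update column family.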
      put.add(rawUpdateColumnFamily, OwnerUserId, stringTokenizer.nextToken().getBytes());
      put.add(rawUpdateColumnFamily, CreationDate, stringTokenizer.nextToken().getBytes());
      put.add(rawUpdateColumnFamily, PostTypeId, stringTokenizer.nextToken().getBytes());
      put.add(rawUpdateColumnFamily, Title, stringTokenizer.nextToken().getBytes());
      put.add(rawUpdateColumnFamily, ViewCount, stringTokenizer.nextToken().getBytes());
      put.add(rawUpdateColumnFamily, AnswerCount, stringTokenizer.nextToken().getBytes());
      put.add(rawUpdateColumnFamily, CommentCount, stringTokenizer.nextToken().getBytes());
      put.add(rawUpdateColumnFamily, FavouriteCount, stringTokenizer.nextToken().getBytes());
      put.add(rawUpdateColumnFamily, ClosedDate, stringTokenizer.nextToken().getBytes());

      System.out.println(put);

      try {
        context.write(new ImmutableBytesWritable(row), put);
      } catch (InterruptedException e) {
        e.printStackTrace();
      }
      if (++count % statuspoint == 0) {
        context.setStatus("Emitting Put " + count);
      }
    } // map
Example #7
 public void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
     throws IOException {
   try {
     LOG.info(
         "Reading in "
             + vertexName
             + " taskid "
             + context.getTaskAttemptID().getTaskID().getId()
             + " key "
             + key.get());
     LOG.info(
         "Sleeping in FinalReduce"
             + ", vertexName="
             + vertexName
             + ", taskAttemptId="
             + context.getTaskAttemptID()
             + ", reduceSleepDuration="
             + reduceSleepDuration
             + ", reduceSleepCount="
             + reduceSleepCount
             + ", sleepLeft="
             + (reduceSleepDuration * (reduceSleepCount - count)));
     context.setStatus(
         "Sleeping... (" + (reduceSleepDuration * (reduceSleepCount - count)) + ") ms left");
     if ((reduceSleepCount - count) > 0) {
       Thread.sleep(reduceSleepDuration);
     }
   } catch (InterruptedException ex) {
     throw (IOException) new IOException("Interrupted while sleeping").initCause(ex);
   }
   count++;
 }
Example #8
    /**
     * Map method.
     *
     * @param offset samples starting from the (offset+1)th sample.
     * @param size the number of samples for this map
     * @param context output {true-&gt;numInside, false-&gt;numOutside}
     */
    public void map(LongWritable offset, LongWritable size, Context context)
        throws IOException, InterruptedException {

      final HaltonSequence haltonsequence = new HaltonSequence(offset.get());
      long numInside = 0L;
      long numOutside = 0L;

      for (long i = 0; i < size.get(); ) {
        // generate points in a unit square
        final double[] point = haltonsequence.nextPoint();

        // count points inside/outside of the inscribed circle of the square
        final double x = point[0] - 0.5;
        final double y = point[1] - 0.5;
        if (x * x + y * y > 0.25) {
          numOutside++;
        } else {
          numInside++;
        }

        // report status
        i++;
        if (i % 1000 == 0) {
          context.setStatus("Generated " + i + " samples.");
        }
      }

      // output map results
      context.write(new BooleanWritable(true), new LongWritable(numInside));
      context.write(new BooleanWritable(false), new LongWritable(numOutside));
    }
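The two counter pairs written above are all a driver needs once a reducer has summed them. Below is a minimal sketch of that final step; the class name PiEstimate and the summed totals numInside/numOutside are assumed for illustration and are not part of the example above.

import java.math.BigDecimal;
import java.math.RoundingMode;

// Hypothetical driver-side helper: the fraction of sample points that fall inside the
// inscribed circle approximates pi/4, so the estimate is 4 * inside / total.
public class PiEstimate {
  public static BigDecimal estimate(long numInside, long numOutside) {
    return BigDecimal.valueOf(4)
        .multiply(BigDecimal.valueOf(numInside))
        .divide(BigDecimal.valueOf(numInside + numOutside), 20, RoundingMode.HALF_UP);
  }

  public static void main(String[] args) {
    // 785398 of 1000000 points inside gives 3.141592...
    System.out.println(estimate(785398L, 214602L));
  }
}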
Example #9
 @Override
 protected void cleanup(Context context) throws IOException, InterruptedException {
   final String tmpDirLoc = context.getConfiguration().get(TMP_FILE_LOC_KEY);
   final File tmpDir = Paths.get(tmpDirLoc).toFile();
   FileUtils.deleteDirectory(tmpDir);
   context.progress();
   context.setStatus("Clean");
 }
Example #10
  protected void indelRealignment(
      Context context, String region, GATKTools gatk, String input, String output)
      throws InterruptedException {
    String targets = tmpFileBase + ".intervals";

    Logger.DEBUG("run RealignerTargetCreator");
    context.setStatus("run RealignerTargetCreator");
    context.getCounter(HalvadeCounters.TOOLS_GATK).increment(1);
    gatk.runRealignerTargetCreator(input, targets, ref, region);

    Logger.DEBUG("run IndelRealigner");
    context.setStatus("run IndelRealigner");
    context.getCounter(HalvadeCounters.TOOLS_GATK).increment(1);
    gatk.runIndelRealigner(input, targets, output, ref, region);

    HalvadeFileUtils.removeLocalFile(keep, input, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, input.replaceAll(".bam", ".bai"));
    HalvadeFileUtils.removeLocalFile(keep, targets, context, HalvadeCounters.FOUT_GATK_TMP);
  }
Example #11
  // TODO improve annotate/filter
  protected void filterVariants(
      Context context, String region, GATKTools gatk, String input, String output)
      throws InterruptedException {
    Logger.DEBUG("run VariantFiltration");
    context.setStatus("run VariantFiltration");
    context.getCounter(HalvadeCounters.TOOLS_GATK).increment(1);
    gatk.runVariantFiltration(input, output, ref, region, windows, cluster, minFS, maxQD);

    HalvadeFileUtils.removeLocalFile(keep, input, context, HalvadeCounters.FOUT_GATK_TMP);
  }
Example #12
  protected void annotateVariants(
      Context context, String region, GATKTools gatk, String input, String output)
      throws InterruptedException {
    Logger.DEBUG("run VariantAnnotator");
    context.setStatus("run VariantAnnotator");
    context.getCounter(HalvadeCounters.TOOLS_GATK).increment(1);
    gatk.runVariantAnnotator(input, output, ref, region);

    HalvadeFileUtils.removeLocalFile(keep, input, context, HalvadeCounters.FOUT_GATK_TMP);
  }
Example #13
 @Override
 protected void map(ImmutableBytesWritable row, Result value, Context context)
     throws IOException, InterruptedException {
   // TODO Auto-generated method stub
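   // The first Bytes.SIZEOF_INT bytes of the row key identify the user; emitting
   // 'one' per row lets the reducer count records per user.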
   ImmutableBytesWritable userKey = new ImmutableBytesWritable(row.get(), 0, Bytes.SIZEOF_INT);
   context.write(userKey, one);
   numRecords++;
   if ((numRecords % 1000) == 0) {
     context.setStatus("mapper processed " + numRecords + " records so far");
   }
 }
Example #14
  protected void splitNTrim(
      Context context, String region, GATKTools gatk, String input, String output)
      throws InterruptedException {
    Logger.DEBUG("run SplitNCigarReads");
    context.setStatus("run SplitNCigarReads");
    context.getCounter(HalvadeCounters.TOOLS_GATK).increment(1);
    gatk.runSplitNCigarReads(input, output, ref, region, newMaxQualScore);

    HalvadeFileUtils.removeLocalFile(keep, input, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, input.replaceAll(".bam", ".bai"));
  }
Example #15
 public void map(IntWritable key, IntWritable value, Context context)
     throws IOException, InterruptedException {
   // it is expected that every map processes mapSleepCount number of records.
   try {
     LOG.info(
         "Reading in "
             + vertexName
             + " taskid "
             + context.getTaskAttemptID().getTaskID().getId()
             + " key "
             + key.get());
     LOG.info(
         "Sleeping in InitialMap"
             + ", vertexName="
             + vertexName
             + ", taskAttemptId="
             + context.getTaskAttemptID()
             + ", mapSleepDuration="
             + mapSleepDuration
             + ", mapSleepCount="
             + mapSleepCount
             + ", sleepLeft="
             + (mapSleepDuration * (mapSleepCount - count)));
     context.setStatus(
         "Sleeping... (" + (mapSleepDuration * (mapSleepCount - count)) + ") ms left");
     if ((mapSleepCount - count) > 0) {
       Thread.sleep(mapSleepDuration);
     }
     if (throwError || throwFatal) {
       throw new IOException("Throwing a simulated error from map");
     }
   } catch (InterruptedException ex) {
     throw (IOException) new IOException("Interrupted while sleeping").initCause(ex);
   }
   ++count;
   // output reduceSleepCount * numReduce number of random values, so that
   // each reducer will get reduceSleepCount number of keys.
   int k = key.get();
   for (int i = 0; i < value.get(); ++i) {
     LOG.info(
         "Writing in "
             + vertexName
             + " taskid "
             + context.getTaskAttemptID().getTaskID().getId()
             + " key "
             + (k + i)
             + " value 1");
     context.write(new IntWritable(k + i), new IntWritable(1));
   }
 }
Example #16
    public void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      try {
        LOG.info(
            "Reading in "
                + vertexName
                + " taskid "
                + context.getTaskAttemptID().getTaskID().getId()
                + " key "
                + key.get());

        LOG.info(
            "Sleeping in IntermediateReduce"
                + ", vertexName="
                + vertexName
                + ", taskAttemptId="
                + context.getTaskAttemptID()
                + ", iReduceSleepDuration="
                + iReduceSleepDuration
                + ", iReduceSleepCount="
                + iReduceSleepCount
                + ", sleepLeft="
                + (iReduceSleepDuration * (iReduceSleepCount - count)));
        context.setStatus(
            "Sleeping... (" + (iReduceSleepDuration * (iReduceSleepCount - count)) + ") ms left");
        if ((iReduceSleepCount - count) > 0) {
          Thread.sleep(iReduceSleepDuration);
        }
      } catch (InterruptedException ex) {
        throw (IOException) new IOException("Interrupted while sleeping").initCause(ex);
      }
      ++count;
      // output reduceSleepCount * numReduce number of random values, so that
      // each reducer will get reduceSleepCount number of keys.
      int k = key.get();
      for (IntWritable value : values) {
        for (int i = 0; i < value.get(); ++i) {
          LOG.info(
              "Writing in "
                  + vertexName
                  + " taskid "
                  + context.getTaskAttemptID().getTaskID().getId()
                  + " key "
                  + (k + i)
                  + " value 1");
          context.write(new IntWritable(k + i), new IntWritable(1));
        }
      }
    }
Example #17
 @Override
 protected void reduce(K row, Iterable<Put> vals, Context context)
     throws IOException, InterruptedException {
   // Using HeapSize to create an upper bound on the memory size of
   // the puts and flush some portion of the content while looping. This
   // flush could result in multiple Puts for a single rowkey. That is
   // acceptable because Combiner is run as an optimization and it's not
   // critical that all Puts are grouped perfectly.
   long threshold =
       context.getConfiguration().getLong("putcombiner.row.threshold", 1L * (1 << 30));
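   // Default threshold: 1 GiB (1L * (1 << 30)) of accumulated KeyValue heap size per row.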
   int cnt = 0;
   long curSize = 0;
   Put put = null;
   Map<byte[], List<Cell>> familyMap = null;
   for (Put p : vals) {
     cnt++;
     if (put == null) {
       put = p;
       familyMap = put.getFamilyCellMap();
     } else {
       for (Entry<byte[], List<Cell>> entry : p.getFamilyCellMap().entrySet()) {
         List<Cell> cells = familyMap.get(entry.getKey());
         List<Cell> kvs = (cells != null) ? (List<Cell>) cells : null;
         for (Cell cell : entry.getValue()) {
           KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
           curSize += kv.heapSize();
           if (kvs != null) {
             kvs.add(kv);
           }
         }
         if (cells == null) {
           familyMap.put(entry.getKey(), entry.getValue());
         }
       }
       if (cnt % 10 == 0) context.setStatus("Combine " + cnt);
       if (curSize > threshold) {
         LOG.info(String.format("Combined %d Put(s) into %d.", cnt, 1));
         context.write(row, put);
         put = null;
         cnt = 0;
       }
     }
   }
   if (put != null) {
     LOG.info(String.format("Combined %d Put(s) into %d.", cnt, 1));
     context.write(row, put);
   }
 }
Example #18
  private void safeMap(LongWritable filePosition, Text line, Context context)
      throws IOException, InterruptedException {
    String lineString = line.toString();
    int colonOffet = lineString.indexOf(':');
    if (colonOffet < 1) {
      return;
    }
    long userId = Long.parseLong(lineString.substring(0, colonOffet));
    context.setStatus("User: " + userId);
    Vertex userVertex = createUserVertex(userId);
    // (audit construction elided in the source; its arguments included
    //  "Friendster MR", "", user, visibility)
    context.write(key, AccumuloSession.createMutationFromRow(audit));

    String friends = lineString.substring(colonOffet + 1).trim();
    if ("notfound".equals(friends) || "private".equals(friends)) {
      // do nothing?
    } else {
      String[] friendsArray = friends.split(",");
      for (String friend : friendsArray) {
        friend = friend.trim();
        if (friend.length() == 0) {
          continue;
        }
        long friendId = Long.parseLong(friend);
        Vertex friendVertex = createUserVertex(friendId);
        addEdge(
            ImportMR.getFriendEdgeId(userVertex, friendVertex),
            userVertex,
            friendVertex,
            FriendsterOntology.EDGE_LABEL_FRIEND,
            visibility,
            authorizations);
        context.getCounter(FriendsterImportCounters.FRIEND_EDGES_CREATED).increment(1);
      }
    }

    context.getCounter(FriendsterImportCounters.USERS_PROCESSED).increment(1);
  }
Example #19
    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      String[] all = value.toString().split("\t");
      // Lines are tab-delimited (rowkey, value); anything else is skipped.
      if (all.length == 2) {
        put = new Put(Bytes.toBytes(all[0]));
        put.add(Bytes.toBytes("xxx"), Bytes.toBytes("20110313"), Bytes.toBytes(all[1]));

        if (!wal) {
          put.setDurability(Durability.SKIP_WAL);
        }

        table.put(put);
      }
      if ((++count % 100) == 0) {
        context.setStatus(count + " DOCUMENTS done!");
        context.progress();
        System.out.println(count + " DOCUMENTS done!");
      }
    }
Example #20
    @Override
    public void map(LongWritable key, Text line, Context context) throws IOException {

      // Input is a CSV file
      // Each map() is a single line, where the key is the line number
      // Each line is comma-delimited; row,family,qualifier,value

      // Split CSV line
      String[] values = line.toString().split(",");
      if (values.length != 4) {
        return;
      }

      // Extract each value
      byte[] row = Bytes.toBytes(values[0]);
      byte[] family = Bytes.toBytes(values[1]);
      byte[] qualifier = Bytes.toBytes(values[2]);
      byte[] value = Bytes.toBytes(values[3]);

      // Create Put
      Put put = new Put(row);
      put.add(family, qualifier, value);

      // Uncomment below to disable the WAL. This improves write performance but risks
      // data loss if a RegionServer crashes. On recent HBase versions, use
      // put.setDurability(Durability.SKIP_WAL) instead.
      // put.setWriteToWAL(false);

      try {
        context.write(new ImmutableBytesWritable(row), put);
      } catch (InterruptedException e) {
        e.printStackTrace();
      }

      // Set status every checkpoint lines
      if (++count % checkpoint == 0) {
        context.setStatus("Emitting Put " + count);
      }
    }
Example #21
    @Override
    protected void map(String key, String value, final Context context)
        throws IOException, InterruptedException {
      final InputSplit split = context.getInputSplit();
      if (!(split instanceof DatasourceInputSplit)) {
        throw new IAE(
            "Unexpected split type. Expected [%s] was [%s]",
            DatasourceInputSplit.class.getCanonicalName(), split.getClass().getCanonicalName());
      }

      final String tmpDirLoc = context.getConfiguration().get(TMP_FILE_LOC_KEY);
      final File tmpDir = Paths.get(tmpDirLoc).toFile();

      final DataSegment segment =
          Iterables.getOnlyElement(((DatasourceInputSplit) split).getSegments()).getSegment();

      final HadoopDruidConverterConfig config =
          converterConfigFromConfiguration(context.getConfiguration());

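      // The conversion proceeds through the phases reported via setStatus below:
      // download the segment locally, convert it with the configured IndexSpec,
      // push the converted segment, then record its metadata for the committer.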
      context.setStatus("DOWNLOADING");
      context.progress();
      final Path inPath = new Path(JobHelper.getURIFromSegment(segment));
      final File inDir = new File(tmpDir, "in");

      if (inDir.exists() && !inDir.delete()) {
        log.warn("Could not delete [%s]", inDir);
      }

      if (!inDir.mkdir() && (!inDir.exists() || !inDir.isDirectory())) {
        log.warn("Unable to make directory");
      }

      final long inSize =
          JobHelper.unzipNoGuava(inPath, context.getConfiguration(), inDir, context);
      log.debug("Loaded %d bytes into [%s] for converting", inSize, inDir.getAbsolutePath());
      context.getCounter(COUNTER_GROUP, COUNTER_LOADED).increment(inSize);

      context.setStatus("CONVERTING");
      context.progress();
      final File outDir = new File(tmpDir, "out");
      if (!outDir.mkdir() && (!outDir.exists() || !outDir.isDirectory())) {
        throw new IOException(String.format("Could not create output directory [%s]", outDir));
      }
      HadoopDruidConverterConfig.INDEX_MERGER.convert(
          inDir, outDir, config.getIndexSpec(), JobHelper.progressIndicatorForContext(context));
      if (config.isValidate()) {
        context.setStatus("Validating");
        HadoopDruidConverterConfig.INDEX_IO.validateTwoSegments(inDir, outDir);
      }
      context.progress();
      context.setStatus("Starting PUSH");
      final Path baseOutputPath = new Path(config.getSegmentOutputPath());
      final FileSystem outputFS = baseOutputPath.getFileSystem(context.getConfiguration());
      final DataSegment finalSegmentTemplate =
          segment.withVersion(segment.getVersion() + "_converted");
      final DataSegment finalSegment =
          JobHelper.serializeOutIndex(
              finalSegmentTemplate,
              context.getConfiguration(),
              context,
              context.getTaskAttemptID(),
              outDir,
              JobHelper.makeSegmentOutputPath(baseOutputPath, outputFS, finalSegmentTemplate));
      context.progress();
      context.setStatus("Finished PUSH");
      final String finalSegmentString =
          HadoopDruidConverterConfig.jsonMapper.writeValueAsString(finalSegment);
      context
          .getConfiguration()
          .set(ConvertingOutputFormat.PUBLISHED_SEGMENT_KEY, finalSegmentString);
      context.write(new Text("dataSegment"), new Text(finalSegmentString));

      context.getCounter(COUNTER_GROUP, COUNTER_WRITTEN).increment(finalSegment.getSize());
      context.progress();
      context.setStatus("Ready To Commit");
    }
Example #22
  protected void PicardPreprocess(
      Context context, PreprocessingTools tools, SAMRecordIterator input, String output)
      throws InterruptedException, QualityException, IOException, URISyntaxException {
    outHeader = header.clone();
    outHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
    // tmp files
    String tmpOut1 = tmpFileBase + "-p1.bam";
    String tmpOut2 = tmpFileBase + "-p2.bam";
    String tmpOut3 = tmpFileBase + "-p3.sam";
    String fCounts = tmpFileBase + "-features.count";
    String tmpMetrics = tmpFileBase + "-p3-metrics.txt";
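    // Stages: write the incoming records to BAM (p1), CleanSam (p2), MarkDuplicates
    // (p3), optional featureCounts, then add read groups (or convert to BAM) and index.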
    SAMFileWriterFactory factory = new SAMFileWriterFactory();
    if (!inputIsBam) {
      outHeader.addReadGroup(bamrg);
    }
    SAMFileWriter writer = factory.makeBAMWriter(outHeader, true, new File(tmpOut1));

    long startTime = System.currentTimeMillis();

    int count = 0;
    SAMRecord sam;
    while (input.hasNext()) {
      sam = input.next();
      writer.addAlignment(sam);
      count++;
    }
    int reads = input.getCount();
    writer.close();

    context.getCounter(HalvadeCounters.IN_PREP_READS).increment(reads);
    long estimatedTime = System.currentTimeMillis() - startTime;
    context.getCounter(HalvadeCounters.TIME_HADOOP_SAMTOBAM).increment(estimatedTime);
    Logger.DEBUG("time writing " + count + " records to disk: " + estimatedTime / 1000);

    Logger.DEBUG("clean sam");
    context.setStatus("clean sam");
    tools.runCleanSam(tmpOut1, tmpOut2);
    Logger.DEBUG("mark duplicates");
    context.setStatus("mark duplicates");
    tools.runMarkDuplicates(tmpOut2, tmpOut3, tmpMetrics);

    if (gff != null) {
      // tmpOut3 is sam for htseq count!
      Logger.DEBUG("featureCounts");
      context.setStatus("featureCounts");
      tools.runFeatureCounts(gff, tmpOut3, fCounts, threads);
      HalvadeFileUtils.uploadFileToHDFS(
          context,
          FileSystem.get(new URI(outputdir), context.getConfiguration()),
          fCounts,
          outputdir + context.getTaskAttemptID().toString() + ".count");
    }

    if (!inputIsBam) {
      Logger.DEBUG("add read-group");
      context.setStatus("add read-group");
      tools.runAddOrReplaceReadGroups(tmpOut3, output, RGID, RGLB, RGPL, RGPU, RGSM);
    } else {
      context.setStatus("convert SAM to BAM");
      Logger.DEBUG("convert SAM to BAM");
      tools.callSAMToBAM(tmpOut3, output, threads);
    }

    Logger.DEBUG("build bam index");
    context.setStatus("build bam index");
    tools.runBuildBamIndex(output);

    estimatedTime = System.currentTimeMillis() - startTime;
    Logger.DEBUG("estimated time: " + estimatedTime / 1000);

    // remove all temporary files now!
    HalvadeFileUtils.removeLocalFile(keep, tmpMetrics, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, tmpOut1, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, tmpOut2, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, tmpOut3, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, fCounts);
  }