Exemplo n.º 1
0
  /**
   * Initializes the mapper state from the job configuration before any record is mapped.
   *
   * <p>Sets up the GC bins, obtains the genome and variation admin handles, reads the
   * {@code "genome"} and {@code "parent"} configuration keys, resolves the genome record, and
   * loads the SNV and size probability tables together with the variation-count-per-bin table.
   *
   * @param context task context whose configuration supplies all settings read here
   * @throws IOException if no genome is registered under the name given by the {@code "genome"}
   *     key, or if an underlying call fails with an I/O error
   * @throws InterruptedException if the probability tables cannot be read; the originating
   *     {@code ProbabilityException} is attached as the cause
   */
  @Override
  protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    setupGCBins(context.getConfiguration());

    genomeAdmin = HBaseGenomeAdmin.getHBaseGenomeAdmin(context.getConfiguration());
    variationAdmin = VariationAdmin.getInstance(context.getConfiguration());

    genomeName = context.getConfiguration().get("genome");
    parentGenome = context.getConfiguration().get("parent");
    genome = genomeAdmin.getGenomeTable().getGenome(genomeName);
    if (genome == null) {
      // Report the requested name; 'genome' itself is null on this path.
      throw new IOException("Genome " + genomeName + " is missing.");
    }

    try {
      SNVProbability table =
          (SNVProbability) variationAdmin.getTable(VariationTables.SNVP.getTableName());
      snvProbabilities = table.getProbabilities();

      SizeProbability sizeTable =
          (SizeProbability) variationAdmin.getTable(VariationTables.SIZE.getTableName());
      sizeProbabilities = sizeTable.getProbabilities();
      variationList = sizeTable.getVariationList();
      // SNVs are added explicitly on top of the variations listed by the size table.
      variationList.add("SNV");
    } catch (ProbabilityException e) {
      // InterruptedException has no cause-taking constructor, so attach the cause explicitly
      // to keep the original stack trace available to whoever handles the failure.
      InterruptedException failure = new InterruptedException("Failed to start mapper: " + e);
      failure.initCause(e);
      throw failure;
    }

    varTable = (VariationCountPerBin) variationAdmin.getTable(VariationTables.VPB.getTableName());
  }
Exemplo n.º 2
0
  /**
   * Mutates one sequence fragment of the parent genome and stores it under the new genome.
   *
   * <p>The chromosome row for the new genome is created on first use. When the value returned by
   * {@code calculateGC()} exceeds 30% of the fragment length, a random fragment profile is drawn
   * from the matching GC bin, each listed variation is applied to the sequence in turn, the
   * mutated sequence is written to the sequence table, and the individual mutations are written
   * to the small-mutations table as one batch. Otherwise the original sequence is copied
   * unchanged.
   *
   * @param key row key of the incoming record (not used here)
   * @param value HBase row holding the original sequence fragment
   * @param context task context (not used here)
   * @throws IOException if the GC bins were not set up, the parent chromosome or a GC bin cannot
   *     be found, the mutated sequence cannot be added, or an underlying call fails
   * @throws InterruptedException declared by the overridden method
   */
  @Override
  protected void map(ImmutableBytesWritable key, Result value, Context context)
      throws IOException, InterruptedException {
    if (maxGCBins == null || GCBins == null) {
      throw new IOException("GC Bins need to be set up.");
    }

    long start = System.currentTimeMillis();

    final SequenceResult origSeq = genomeAdmin.getSequenceTable().createResult(value);
    final ChromosomeResult origChr =
        genomeAdmin.getChromosomeTable().getChromosome(parentGenome, origSeq.getChr());

    // get hbase objects for new genome
    ChromosomeResult mutatedChr =
        genomeAdmin.getChromosomeTable().getChromosome(genome.getName(), origSeq.getChr());
    if (mutatedChr == null) {
      // The new chromosome row is cloned from the parent's, so the parent must exist.
      if (origChr == null) {
        throw new IOException(
            "Chromosome "
                + origSeq.getChr()
                + " is missing from parent genome "
                + parentGenome
                + ".");
      }
      String rowId =
          genomeAdmin
              .getChromosomeTable()
              .addChromosome(
                  genome, origChr.getChrName(), origChr.getLength(), origChr.getSegmentNumber());
      mutatedChr = genomeAdmin.getChromosomeTable().queryTable(rowId);
    }

    /* --- Mutate sequence --- */
    Random randomFragment = new Random();
    DNASequence mutatedSequence = new DNASequence(origSeq.getSequence());
    /* Don't bother to try and mutate a fragment that is more than 70% 'N' */
    int gcContent = mutatedSequence.calculateGC();
    if (gcContent > (0.3 * origSeq.getSequenceLength())) {
      // Start from the chromosome's max bin and narrow to the bin matching this GC content.
      // The looked-up bin must be kept: discarding it would sample every fragment from the
      // max bin regardless of its GC content.
      GCBin.GCResult gcResult = maxGCBins.get(origSeq.getChr());
      if (gcResult != null && gcContent < gcResult.getMax()) {
        gcResult = getBin(origSeq.getChr(), gcContent);
      }
      if (gcResult == null) {
        throw new IOException(
            "No GC bin found for chromosome "
                + origSeq.getChr()
                + " with GC content "
                + gcContent
                + ".");
      }

      // get random fragment within this bin
      List<VCPBResult> varsPerFrag =
          varTable.getFragment(
              origSeq.getChr(),
              gcResult.getMin(),
              gcResult.getMax(),
              randomFragment.nextInt(gcResult.getTotalFragments()),
              variationList);

      Map<Variation, Map<Location, DNASequence>> mutations =
          new HashMap<Variation, Map<Location, DNASequence>>();

      // apply the variations to the sequence, each of them needs to apply to the same fragment
      // it is possible that one could override another (e.g. a deletion removes SNVs)
      // TODO need to check that these are ordered by variation based on the hbase
      // ordering...SNV, del, ins, ...
      for (VCPBResult variation : varsPerFrag) {
        Variation v = createInstance(variation.getVariationClass());
        v.setVariationName(variation.getVariationName());
        if (variation.getVariationName().equals("SNV")) {
          SNV snv = ((SNV) v);
          snv.setSnvFrequencies(snvProbabilities);
        } else {
          v.setSizeVariation(sizeProbabilities.get(variation.getVariationName()));
        }

        mutatedSequence = v.mutateSequence(mutatedSequence, variation.getVariationCount());
        if (v.getLastMutations().size() > 0) {
          mutations.put(v, v.getLastMutations());
        }
      }

      String mutSeqRowId =
          genomeAdmin
              .getSequenceTable()
              .addSequence(
                  mutatedChr,
                  origSeq.getStart(),
                  (origSeq.getStart() + mutatedSequence.getLength()),
                  mutatedSequence.getSequence(),
                  origSeq.getSegmentNum());
      if (mutSeqRowId == null) {
        throw new IOException("Failed to add sequence.");
      }

      // add any mutations to the small mutations table, best done as a batch job
      List<Put> puts = new ArrayList<Put>();
      SequenceResult mutSequence = genomeAdmin.getSequenceTable().queryTable(mutSeqRowId);
      for (Map.Entry<Variation, Map<Location, DNASequence>> applied : mutations.entrySet()) {
        Variation v = applied.getKey();
        for (Map.Entry<Location, DNASequence> entry : applied.getValue().entrySet()) {
          try {
            Row row =
                genomeAdmin
                    .getSmallMutationsTable()
                    .newMutationRow(
                        mutSequence,
                        v,
                        entry.getKey().getStart(),
                        entry.getKey().getEnd(),
                        entry.getValue().getSequence());
            puts.add(new Put(genomeAdmin.getSmallMutationsTable().getPut(row)));
          } catch (IllegalArgumentException ae) {
            // A single bad mutation row is logged and skipped; the rest of the batch proceeds.
            log.error("Failed to add " + mutSeqRowId + " var " + v.toString(), ae);
          }
        }
      }
      try {
        genomeAdmin.getSmallMutationsTable().put(puts);
      } catch (IOException ioe) {
        // Best effort: the mutated sequence is already stored, so only log the failed batch.
        log.error(ioe);
      }
    } else {
      // Below the threshold: copy the original fragment to the new genome unchanged.
      genomeAdmin
          .getSequenceTable()
          .addSequence(
              mutatedChr,
              origSeq.getStart(),
              origSeq.getEnd(),
              origSeq.getSequence(),
              origSeq.getSegmentNum());
    }

    long elapsed = System.currentTimeMillis() - start;
    log.debug("FINISHED MAP " + origSeq.getRowId() + " time=" + elapsed);
  }