/**
 * Loads the GC bin table and caches its contents in the {@code GCBins} and {@code maxGCBins}
 * fields.
 *
 * <p>Every {@link GCBin.GCResult} returned by {@code getBins()} is stored in {@code GCBins} under
 * a {@link Location} built from the bin map's String key and the result's min/max values.
 * {@code maxGCBins} is taken directly from {@code getMaxBins()}.
 *
 * @param conf configuration handed to {@code VariationAdmin.getInstance}
 * @throws IOException declared for the table lookup
 */
private void setupGCBins(Configuration conf) throws IOException {
  final GCBin gcBinTable =
      (GCBin) VariationAdmin.getInstance(conf).getTable(VariationTables.GC.getTableName());

  GCBins = new HashMap<Location, GCBin.GCResult>();
  for (Map.Entry<String, List<GCBin.GCResult>> binsForKey : gcBinTable.getBins().entrySet()) {
    final String binKey = binsForKey.getKey();
    for (GCBin.GCResult bin : binsForKey.getValue()) {
      final Location binRange = new Location(binKey, bin.getMin(), bin.getMax());
      GCBins.put(binRange, bin);
    }
  }

  maxGCBins = gcBinTable.getMaxBins();
}
@Override protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException, InterruptedException { if (maxGCBins == null || GCBins == null) throw new IOException("GC Bins need to be set up."); long start = System.currentTimeMillis(); final SequenceResult origSeq = genomeAdmin.getSequenceTable().createResult(value); final ChromosomeResult origChr = genomeAdmin.getChromosomeTable().getChromosome(parentGenome, origSeq.getChr()); // get hbase objects for new genome ChromosomeResult mutatedChr = genomeAdmin.getChromosomeTable().getChromosome(genome.getName(), origSeq.getChr()); if (mutatedChr == null) { String rowId = genomeAdmin .getChromosomeTable() .addChromosome( genome, origChr.getChrName(), origChr.getLength(), origChr.getSegmentNumber()); mutatedChr = genomeAdmin.getChromosomeTable().queryTable(rowId); } /* --- Mutate sequence --- */ Random randomFragment = new Random(); DNASequence mutatedSequence = new DNASequence(origSeq.getSequence()); /* Don't bother to try and mutate a fragment that is more than 70% 'N' */ int gcContent = mutatedSequence.calculateGC(); if (gcContent > (0.3 * origSeq.getSequenceLength())) { GCBin.GCResult gcResult = maxGCBins.get(origSeq.getChr()); if (gcContent < gcResult.getMax()) getBin(origSeq.getChr(), gcContent); // get random fragment within this bin List<VCPBResult> varsPerFrag = varTable.getFragment( origSeq.getChr(), gcResult.getMin(), gcResult.getMax(), randomFragment.nextInt(gcResult.getTotalFragments()), variationList); Map<Variation, Map<Location, DNASequence>> mutations = new HashMap<Variation, Map<Location, DNASequence>>(); // apply the variations to the sequence, each of them needs to apply to the same fragment // it is possible that one could override another (e.g. a deletion removes SNVs) // TODO need to check that these are ordered these by variation based on the hbase // ordering...SNV, del, ins, ... 
for (VCPBResult variation : varsPerFrag) { Variation v = createInstance(variation.getVariationClass()); v.setVariationName(variation.getVariationName()); if (variation.getVariationName().equals("SNV")) { SNV snv = ((SNV) v); snv.setSnvFrequencies(snvProbabilities); } else v.setSizeVariation(sizeProbabilities.get(variation.getVariationName())); mutatedSequence = v.mutateSequence(mutatedSequence, variation.getVariationCount()); if (v.getLastMutations().size() > 0) mutations.put(v, v.getLastMutations()); } String mutSeqRowId = genomeAdmin .getSequenceTable() .addSequence( mutatedChr, origSeq.getStart(), (origSeq.getStart() + mutatedSequence.getLength()), mutatedSequence.getSequence(), origSeq.getSegmentNum()); if (mutSeqRowId == null) throw new IOException("Failed to add sequence."); // add any mutations to the small mutations table, best done as a batch job List<Put> puts = new ArrayList<Put>(); SequenceResult mutSequence = genomeAdmin.getSequenceTable().queryTable(mutSeqRowId); for (Variation v : mutations.keySet()) { for (Map.Entry<Location, DNASequence> entry : mutations.get(v).entrySet()) { try { Row row = genomeAdmin .getSmallMutationsTable() .newMutationRow( mutSequence, v, entry.getKey().getStart(), entry.getKey().getEnd(), entry.getValue().getSequence()); puts.add(new Put(genomeAdmin.getSmallMutationsTable().getPut(row))); } catch (IllegalArgumentException ae) { log.error("Failed to add " + mutSeqRowId + " var " + v.toString(), ae); } } } try { genomeAdmin.getSmallMutationsTable().put(puts); } catch (IOException ioe) { log.error(ioe); } } else genomeAdmin .getSequenceTable() .addSequence( mutatedChr, origSeq.getStart(), origSeq.getEnd(), origSeq.getSequence(), origSeq.getSegmentNum()); long end = System.currentTimeMillis() - start; log.debug("FINISHED MAP " + origSeq.getRowId() + " time=" + String.valueOf(end)); }