public void constructMatrix(ArrayList<MiRBaseEntry> mirb, int subnum, String taxo) { ArrayList<TrainingEntry> pos = positiveData(mirb, subnum); trainSet_p = parsePara(pos, taxo); ArrayList<TrainingEntry> neg = negativeData(pos, level); trainSet_n = parsePara(neg, taxo); // report int total = 0; for (TrainingEntry t : pos) total += t.NumOfMi(); for (TrainingEntry t : neg) total += t.NumOfMi(); System.out.println("Total Training Entries: " + total); System.out.println("Level: " + level); }
/** * parse parameters of all entries * * @param in */ private svm_problem parsePara(ArrayList<TrainingEntry> in, String taxo) { int num = 0; for (TrainingEntry t : in) num += t.NumOfMi(); svm_node[][] data = new svm_node[num][]; int i = 0; for (TrainingEntry mi : in) { pri = mi.getPriRNA(); pri.setPriPara(); for (int j = 0; j < mi.NumOfMi(); j++) { pri.process(mi.getStart(j), mi.getSize(j), mi.getStrand(j)); ps = new ParaSummary(pri, pri.getPreRNA(), pri.getMiRNA()); data[i++] = svm.predictMatrix(mi.getLabel(j), ps.paraMatrix(taxo)); } } return svm.trainMatrix(data); }
public ArrayList<TrainingEntry> positiveData(ArrayList<MiRBaseEntry> milist, int subnum) { System.out.print("Reads miRBase data "); ArrayList<TrainingEntry> pos = new ArrayList<TrainingEntry>(); int num = milist.size(); ArrayList<Integer> rmis = randomization(num, num); // randomizate the miRBase entries MfeFold doFold; int evNum = 0; // number of experimental verified miRNAs int uevNum = 0; // number of unexperimental verified miRNAs int positive = 0; // number of positive mirna int n = 0; for (Integer m : rmis) { if (subnum > 0 && positive >= subnum) break; // get subnum mirans MiRBaseEntry mi = milist.get(m); TrainingEntry te = new TrainingEntry(); pri = new PriMiRNA(mi.getPriAccession(), mi.getPriSequence()); // fold pri-miRNA doFold = new MfeFold(pri.getSeq()); doFold.cal(); pri.setStr(doFold.getStructure()); pri.setEnergy(doFold.getEnergy()); te.setPriRNA(pri); // store pri int end5 = pri.getStr().lastIndexOf("(") + 1; // 5' end position of hairpin,count from 1 int start3 = pri.getStr().indexOf(")") + 1; // 3' start position of hairpin,count from 1 for (int i = 0; i < mi.getMatNum(); i++) { // the miRNA should be experimental verified miRNA if (mi.getEvidence(i).equals("experimental")) { evNum += 1; // the miRNA should not have two or more loops if (end5 >= start3) continue; int strand; if (mi.getMatEnd(i) <= end5) strand = 5; // lie in 5' else if (mi.getMatStart(i) >= start3) strand = 3; // lie in 3' else continue; // the miRNA should not lie in loop area te.addMiEntry(mi.getMatStart(i) - 1, mi.getMatEnd(i) - mi.getMatStart(i) + 1, strand, 1); // for the fasta format mirna data // te.setMiId(mi.getMatID(i)); // p_fas.put(te.getMiId(te.NumOfMi()-1), te.getMiSeq(te.NumOfMi()-1)); positive++; } else uevNum += 1; } // add positive data if (te.NumOfMi() > 0) pos.add(te); n++; System.out.print(n + ":" + mi.getPriID() + backspace(n + ":" + mi.getPriID())); } System.out.println(); // report System.out.println("Loads " + (evNum + uevNum) + " Entries"); System.out.println("Experimental verified Entries: " + evNum); System.out.println("Non-experimental verified Entries: " + uevNum); System.out.println("Positive Entries: " + positive); return pos; }
public ArrayList<TrainingEntry> negativeData(ArrayList<TrainingEntry> pos, int level) { Random r = new Random(); // random number producer int rS = 0; ArrayList<TrainingEntry> nes = new ArrayList<TrainingEntry>(); TrainingEntry ne; int strand = 0; int negative = 0; for (TrainingEntry po : pos) { ne = new TrainingEntry(); ne.setPriRNA(po.getPriRNA()); int end5 = po.getPriRNA().getStr().lastIndexOf("("); // from 0 int start3 = po.getPriRNA().getStr().indexOf(")"); // from 0 int endP5 = end5 + (start3 - end5 - 1) / 2; // 5' endpoint of hairpin,count from 0 int endP3 = start3 - (start3 - end5 - 1) / 2; // 3' endpoint of hairpin, count from 0 int num = po.NumOfMi(); // the number of postive mi int size = po.getPriRNA().getLength(); // the size of pri in the entry for (int n = 0; n < level; n++) { // each level for (int i = 0; i < num; i++) { // each miRNA in the entry int miL = po.getSize(i); // the positive mi size HashMap bases = new HashMap(); int flag = 0; while (bases.size() < size) { // the negative mi start rS = r.nextInt(size); // get a random start if (bases.containsKey(rS)) continue; bases.put(rS, null); // store the positions which have been used // the random start is at least 5-bp from the true start positions on the entry int j; for (j = 0; j < num; j++) { if (Math.abs(po.getStart(i) - rS) < 5) break; } if (j < num) continue; // if the random start at 5' strand if ((rS < endP5 && rS + miL - 1 <= endP5)) { strand = 5; flag = 1; break; } // if the random start at 3' strand else if (rS >= endP3 && rS + miL < size) { strand = 3; flag = 1; break; } // else continue; } if (flag == 1) { ne.addMiEntry(rS, miL, strand, 0); // for the fasta format mirna data // ne.setMiId(po.getMiId(i)+"n"+n); // n_fas.put(ne.getMiId(i), ne.getMiSeq(i)); negative++; } else System.out.println("no negative data found on " + po.getPriRNA().getId()); } } nes.add(ne); } System.out.println("Negative Entries: " + negative); return nes; }