public String induceRulesWithTestSet(double testSetPercent) throws DataFormatException { SystemVariables.getInstance() .appendToExperimentNotes("\\n* Total number of PSSM matches in positive sequences: "); // System.out.println("Total number of PSSM matches in positive sequences: "); ArrayList<Feature> posTrainingSet = regulatoryElementService.getRegulatoryElements( regulatoryRegionService.getPositiveRegulatoryRegions(), positiveCutOffScore, posATcomposition, posCGcomposition); SystemVariables.getInstance() .appendToExperimentNotes("\\n* Total number of PSSM matches in negative sequences: "); // System.out.println("Total number of PSSM matches in negative sequences: "); ArrayList<Feature> negTrainingSet = regulatoryElementService.getRegulatoryElements( regulatoryRegionService.getNegativeRegulatoryRegions(), negativeCutOffScore, negATcomposition, negCGcomposition); IlpService formatter = new IlpService(tempIlpJobDirName); formatter.createIlpFilesWithTestSet( regulatoryRegionService.getPositiveRegulatoryRegions(), regulatoryRegionService.getNegativeRegulatoryRegions(), posTrainingSet, negTrainingSet, testSetPercent); String ilpTheory = formatter.runILP(); return ilpTheory; }
@Override public void updateNumberOfPositiveRegRegions(int num) throws DataFormatException { // TODO check for empty regions // TODO test this int deltaSeqNum = num - positiveRegRegions.size(); if (deltaSeqNum < 0) { // desired number of positive sequences is less than is available now for (int i = 0; i > deltaSeqNum; i--) { positiveRegRegions.remove(0); } } else { // desired number is more than available -> need to generate extra ArrayList<Feature> extraSimulatedGenes = FeaturesTools.generateSimulatedRegulatoryRegionsWithPositionalPSSMs( deltaSeqNum, regulatorySequenceLength, sequenceNtProbabilities, SystemVariables.getInstance().getPositivePatserCutOffScore(), POSITIVE_REGION_PREFIX, modelRegRegionService.getPositiveRegulatoryRegions(), modelRegElService); // generateRegulatoryRegions(deltaSeqNum, POSITIVE_REGION_PREFIX); negativeRegRegions.addAll(extraSimulatedGenes); } }
public IlpService createIlpFiles() throws DataFormatException { ArrayList<Feature> posRegElements = regulatoryElementService.getRegulatoryElements( regulatoryRegionService.getPositiveRegulatoryRegions(), positiveCutOffScore, posATcomposition, posCGcomposition); if (posRegElements == null) { posRegElements = regulatoryElementService.getRegulatoryElements( regulatoryRegionService.getPositiveRegulatoryRegions(), regulatoryRegionService.getNegativeRegulatoryRegions(), positiveCutOffScore); } // ***** Add statistics addMotifMatchingStatisticsToNotes("positive"); SystemVariables.getInstance().setPosSeqRelElMatchesNum(posRegElements.size()); ArrayList<Feature> negRegElements = regulatoryElementService.getRegulatoryElements( regulatoryRegionService.getNegativeRegulatoryRegions(), negativeCutOffScore, negATcomposition, negCGcomposition); if (negRegElements == null) { negRegElements = regulatoryElementService.getRegulatoryElements( regulatoryRegionService.getNegativeRegulatoryRegions(), null, negativeCutOffScore); } // ***** Add statistics addMotifMatchingStatisticsToNotes("negative"); SystemVariables.getInstance().setNegSeqRelElMatchesNum(negRegElements.size()); /* System.out.println("\n Number of matches of pairs of matrices in jurkat and erythroid sequences"); String[] pwmPair = new String[]{"MA0055.1", "MA0152.1"}; System.out.println(pwmPair[0] + " and " + pwmPair[1]+" :\t" + FeaturesTools.numSequencesContainingAll(pwmPair, posRegElements) + "\t"+ FeaturesTools.numSequencesContainingAll(pwmPair, negRegElements)); System.out.println("\nDistances between Gata and Ebox for positive sequences:"); FeaturesTools.printRegElDistances("Gata", "Ebox", posRegElements); */ IlpService ilpService = new IlpService(tempIlpJobDirName); ilpService.createIlpFiles( regulatoryRegionService.getPositiveRegulatoryRegions(), regulatoryRegionService.getNegativeRegulatoryRegions(), posRegElements, negRegElements); return ilpService; }
/**
 * Creates a service holding simulated positive and negative regulatory regions.
 *
 * <p>A {@code CElegansRegRegionService} and a {@code PatserRegElementService} serve as the model
 * for the simulation. The number of negative regions is {@code negExMultiplicationFactor *
 * numOfPositiveRegRegions}.
 *
 * @param numOfPositiveRegRegions number of positive regions to simulate
 * @param negExMultiplicationFactor how many negative regions to simulate per positive region
 * @throws DataFormatException if the model service has no positive regulatory regions
 */
public SyntheticRegRegionService(int numOfPositiveRegRegions, int negExMultiplicationFactor)
    throws DataFormatException {
  SystemVariables settings = SystemVariables.getInstance();

  modelRegRegionService = new CElegansRegRegionService(0);
  File pwmDir = new File(settings.getString("C.elegans.PWMs.dir"));
  // TODO create a proper tmp job dir for experiments
  String tmpJobDir = settings.getString("temp.output.dir");
  modelRegElService = new PatserRegElementService(pwmDir, tmpJobDir);

  // Without model regions there is nothing to base the synthetic regions on.
  if (modelRegRegionService.getPositiveRegulatoryRegions().size() <= 0) {
    throw new DataFormatException(
        "Model regulatory regions are empty. Can not model synthetic regions based on an empty regions.");
  }
  // The length is taken from the first model positive region ...
  regulatorySequenceLength =
      modelRegRegionService.getPositiveRegulatoryRegions().get(0).getSequence().length();
  // ... and then overridden.
  // TODO: remove - just for testing
  regulatorySequenceLength = 25;

  int numOfNegativeRegRegions = negExMultiplicationFactor * numOfPositiveRegRegions;
  negativeRegRegions =
      FeaturesTools.generateSimulatedRegulatoryRegions(
          numOfNegativeRegRegions,
          regulatorySequenceLength,
          NEGATIVE_REGION_PREFIX,
          sequenceNtProbabilities);

  positiveRegRegions =
      FeaturesTools.generateSimulatedRegulatoryRegionsWithPositionalPSSMs(
          numOfPositiveRegRegions,
          regulatorySequenceLength,
          sequenceNtProbabilities,
          settings.getPositivePatserCutOffScore(),
          POSITIVE_REGION_PREFIX,
          modelRegRegionService.getPositiveRegulatoryRegions(),
          modelRegElService);
}
/* Note: unsound method * Based on the statistics, collected when extracting motif matches by RegElementService, * collects text note of motif matching statistics, which will be added to ILP file. * This method has to be called immediately after RegElementService.getRegulatoryElements call, * since every such call overrides the pssmMatchStatistics in RegElementService * * @param sequenceKind - can only be "positive" or "negative", depending on the kind of sequences */ private void addMotifMatchingStatisticsToNotes(String sequenceKind) throws DataFormatException { Hashtable<String, Double> pssmMatchStats = regulatoryElementService.getPssmMatchingStatistics(); if (pssmMatchStats != null && !pssmMatchStats.isEmpty()) { SystemVariables.getInstance() .appendToExperimentNotes( "\\n* Number of PSSM matches in " + sequenceKind + " sequences: "); int numberOfRegions = 0; if ("positive".equals(sequenceKind)) { numberOfRegions = regulatoryRegionService.getPositiveRegulatoryRegions().size(); } else { numberOfRegions = regulatoryRegionService.getNegativeRegulatoryRegions().size(); } Enumeration<String> pssmMatchNames = pssmMatchStats.keys(); int totalNumMatches = 0; String r_pssmNames = ""; String r_matches = ""; while (pssmMatchNames.hasMoreElements()) { String pssmName = pssmMatchNames.nextElement(); double pssmStat = pssmMatchStats.get(pssmName); int numMatches = (int) (pssmStat * numberOfRegions); totalNumMatches = totalNumMatches + numMatches; r_pssmNames = r_pssmNames + "\'" + pssmName + "\', "; r_matches = r_matches + numMatches + ", "; SystemVariables.getInstance() .appendToExperimentNotes("\\n\\t" + pssmName + "\\t" + numMatches); } SystemVariables.getInstance() .appendToExperimentNotes("\\n Total number of matches: " + totalNumMatches); // System.out.println("In positive sequences: "); // System.out.println(r_pssmNames); // System.out.println(r_matches); } }
/* Generates regulatory regions for ILP positive examples. * Base of the sequence is random ACGT 0.25 : 0.25 : 0.25 : 0.25 composition, * cut-off score for finding PSSMs in model sequences is a default positive cut-off * C.elegans pwm are planted in the base sequence at random positions. * * @param int number of regions to be generates * @param String generic prefix for the name of regulatory sequence */ private ArrayList<Feature> generateRegulatoryRegions(int numOfRegions, String regionNamePrefix) throws DataFormatException { ArrayList<Feature> regRegions = new ArrayList<Feature>(); if (numOfRegions <= 0) { return regRegions; } // Generate probabilities based on C.elegans data // RegulatoryRegionService regRegionService = // RegulatoryRegionServiceFactory.getService(RegulatoryRegionService.C_ELEGANS_DATA_SERVICE, 0, // 0); ArrayList<Feature> cElegansRegRegions = modelRegRegionService.getPositiveRegulatoryRegions(); if (cElegansRegRegions == null) { throw new DataFormatException( "SyntheticRegRegionService: Unable to retrieve C.elegans regulatory regions to generate positive sequences."); } File pwmDir = new File(SystemVariables.getInstance().getString("C.elegans.PWMs.dir")); String tmpJobDir = SystemVariables.getInstance() .getString("temp.output.dir"); // TODO create a proper tmp job dir for experiments PatserRegElementService regElService = new PatserRegElementService(pwmDir, tmpJobDir); double cutOffScore = SystemVariables.getInstance().getPositivePatserCutOffScore(); double atComposition = sequenceNtProbabilities[0] + sequenceNtProbabilities[3]; double cgComposition = sequenceNtProbabilities[1] + sequenceNtProbabilities[2]; ArrayList<Feature> regElements = regElService.getRegulatoryElements( cElegansRegRegions, cutOffScore, atComposition, cgComposition); ArrayList<RegulatoryElementPWM> regElementPWMs = regElService.getRegulatoryElementsPWMs(); StatAnalyser stat = new StatAnalyser(regElements); for (int i = 0; i < numOfRegions; i++) { String sequence = 
generatePosExSequence( stat, regElementPWMs, cElegansRegRegions.get(0).getSequence().length(), cElegansRegRegions.size()); regRegions.add(new Feature(regionNamePrefix + i, "gene", null, 0, 0, sequence, 0.0)); } return regRegions; }
public Explorer( RegulatoryRegionService regRegionService, RegulatoryElementService regElService, String tempIlpJobDirName) throws DataFormatException { this.regulatoryRegionService = regRegionService; this.regulatoryElementService = regElService; this.tempIlpJobDirName = tempIlpJobDirName; positiveCutOffScore = SystemVariables.getInstance().getPositivePatserCutOffScore(); negativeCutOffScore = SystemVariables.getInstance().getNegativePatserCutOffScore(); /////// Statistics SystemVariables.getInstance().cleanStatistics(); // clean old; SystemVariables.getInstance() .setPosSeqNum(regulatoryRegionService.getPositiveRegulatoryRegions().size()); SystemVariables.getInstance() .setNegSeqNum(regulatoryRegionService.getNegativeRegulatoryRegions().size()); SystemVariables.getInstance() .appendToExperimentNotes( "\\n* Number of positive sequences: " + regulatoryRegionService.getPositiveRegulatoryRegions().size()); SystemVariables.getInstance() .appendToExperimentNotes( "\\n* Number of negative sequences: " + regulatoryRegionService.getNegativeRegulatoryRegions().size()); SystemVariables.getInstance() .appendToExperimentNotes( "\\n* Patser cut-off score for positive sequences: " + positiveCutOffScore); SystemVariables.getInstance() .appendToExperimentNotes( "\\n* Patser cut-off score for negative sequences: " + negativeCutOffScore); double[] posNtComposition = FeaturesTools.getNucleotideComposition( regulatoryRegionService.getPositiveRegulatoryRegions()); posATcomposition = posNtComposition[0] + posNtComposition[3]; posCGcomposition = posNtComposition[1] + posNtComposition[2]; SystemVariables.getInstance() .appendToExperimentNotes( "\\n* A:T and C:G composition of positive sequences: " + posATcomposition + " " + posCGcomposition); SystemVariables.getInstance().setPosATcomposition(posATcomposition); SystemVariables.getInstance().setPosCGcomposition(posCGcomposition); double[] negNtComposition = FeaturesTools.getNucleotideComposition( 
regulatoryRegionService.getNegativeRegulatoryRegions()); negATcomposition = negNtComposition[0] + negNtComposition[3]; negCGcomposition = negNtComposition[1] + negNtComposition[2]; SystemVariables.getInstance() .appendToExperimentNotes( "\\n* A:T and C:G composition of negative sequences: " + negATcomposition + " " + negCGcomposition); SystemVariables.getInstance().setNegATcomposition(negATcomposition); SystemVariables.getInstance().setNegCGcomposition(negCGcomposition); }